# This is a config file for BeeGFS metadata nodes.
# http://www.beegfs.com


# --- [Table of Contents] ---
#
# 1) Settings
# 2) Command Line Arguments
# 3) Basic Settings Documentation
# 4) Advanced Settings Documentation


#
# --- Section 1.1: [Basic Settings] ---
#

sysMgmtdHost =

storeMetaDirectory =
storeAllowFirstRunInit = true
storeFsUUID =


#
# --- Section 1.2: [Advanced Settings] ---
#

connAuthFile = /etc/beegfs/conn.auth
connDisableAuthentication = false
connBacklogTCP = 128
connFallbackExpirationSecs = 900
connInterfacesFile =
connMaxInternodeNum = 32

connMetaPort = 8005
connMgmtdPort = 8008
connPortShift = 0

connNetFilterFile =

connUseRDMA = true
connRDMATypeOfService = 0
connTcpOnlyFilterFile =

logType = syslog
logLevel = 3
logNoDate = false
logNumLines = 50000
logNumRotatedFiles = 5
logStdFile = /var/log/beegfs-meta.log

runDaemonized = true

storeClientXAttrs = false
storeClientACLs = false
storeUseExtendedAttribs = true

sysTargetAttachmentFile =
sysTargetOfflineTimeoutSecs = 180
sysAllowUserSetPattern = false

tuneBindToNumaZone =
tuneNumStreamListeners = 1
tuneNumWorkers = 0
tuneTargetChooser = randomized
tuneUseAggressiveStreamPoll = false
tuneUsePerUserMsgQueues = false


#
# --- Section 2: [Command Line Arguments] ---
#

# Use the command line argument "cfgFile=/etc/anotherconfig.conf" to
# specify a different config file for beegfs-meta.
# All other options in this file can also be used as command line
# arguments, overriding the corresponding config file values.


#
# --- Section 3: [Basic Settings Documentation] ---
#

# [sysMgmtdHost]
# Hostname (or IP) of the host running the management service.
# (See also "connMgmtdPort")
# Default:

# [storeMetaDirectory]
# The absolute path and name of a directory where the file system can store its
# metadata.
# Default:

# [storeFsUUID]
# Requires the underlying file system of the metadata directory to have the same
# UUID as set here. This prevents the meta node from accidentally starting from
# the wrong device, e.g. when it is not properly mounted. To find the UUID to
# put here, you can, for example, use blkid:
#
# blkid -s UUID
#
# This will output all devices on the host with their file system's UUID (if
# there is one). Choose the correct one and copy it here. This command needs to
# be run as root.
#
# If left empty, the check is skipped. It is highly recommended to enable this
# check after installation to prevent data corruption.
# Default:

# [storeAllowFirstRunInit]
# Enables or disables daemon startup with an uninitialized storage directory.
# This can be used to make sure that the daemon does not run when the storage
# partition is not mounted (e.g. because it needs repair after a power outage).
# Note: This setting must be enabled during first startup of the daemon, but
# may be disabled afterwards.
# Default: true


#
# --- Section 4: [Advanced Settings Documentation] ---
#

#
# --- Section 4.1: [Connections & Communication] ---
#

# [connAuthFile]
# The path to a file that contains a shared secret for connection based
# authentication. Only peers that use the same shared secret will be able to
# connect.
# Default:

# [connDisableAuthentication]
# If set to true, explicitly disables connection authentication and allows the
# service to run without a connAuthFile. Running BeeGFS without connection
# authentication is considered insecure and is not recommended.
# Default: false

# [connBacklogTCP]
# The TCP listen backlog.
# Default: 128

# [connFallbackExpirationSecs]
# The time in seconds after which a connection to a fallback interface expires.
# When a fallback connection expires, the system will try to establish a new
# connection to the other host's primary interface (falling back to another
# interface again if necessary).
# Note: The priority of node interfaces can be configured using the
# "connInterfacesFile" parameter.
# Default: 900 # [connInterfacesFile] # The path to a text file that specifies the names of the interfaces which # may be used for communication by other nodes. One interface per line. The # line number also defines the priority of an interface. # Example: "ib0" in the first line, "eth0" in the second line. # Values: This setting is optional. If unspecified, all available interfaces # will be published and priorities will be assigned automatically. # Note: This information is sent to other hosts to inform them about possible # communication paths. See connRestrictOutboundInterfaces for this # configuration's potential effect on outbound connections. # Default: # [connInterfacesList] # Comma-separated list of interface names. Performs the same function as # connInterfacesFile. # Default: # [connRestrictOutboundInterfaces] # The default behavior of BeeGFS is to use any available network interface # to establish an outbound connection to a node, according to the TCP/IP # configuration of the operating system. When connRestrictOutboundInterfaces # is set to true, the network interfaces used for outbound connections are # limited to the values specified by connInterfacesFile or connInterfacesList. # The operating system routing tables are consulted to determine which # interface to use for a particular node's IP address. If there is no # route from the configured interfaces that is suitable for a node's IP # addresses then the connection will fail to be established. # Default: false # [connNoDefaultRoute] # When connRestrictOutboundInterfaces is true, the routing logic would use # the default route for a Node's IP address when no specific route for that # address is found in the routing tables. This can be problematic during a # failure situation, as the default route is not appropriate to use for a # subnet that is accessible from an interface that has failed. # connNoDefaultRoute is a comma-separated list of CIDRs that should never # be accessed via the default route. 
# Default: 0.0.0.0/0. This prevents the default route from ever being used. # [connMaxInternodeNum] # The maximum number of simultaneous connections to the same node. # Default: 32 # [connMetaPort] # The UDP and TCP port of the metadata node. # Default: 8005 # [connMgmtdPort] # The UDP and TCP port of the management node. # Default: 8008 # [connPortShift] # Shifts all following UDP and TCP ports according to the specified value. # Intended to make port configuration easier in case you do not want to # configure each port individually. # Default: 0 # [connNetFilterFile] # The path to a text file that specifies allowed IP subnets, which may be used # for outgoing communication. One subnet per line in classless notation (IP # address and number of significant bits). # Example: "192.168.10.0/24" in the first line, "192.168.20.0/24" in the second # line. # Values: This setting is optional. If unspecified, all addresses are allowed # for outgoing communication. # Default: # [connTCPRcvBufSize], [connUDPRcvBufSize] # Sets the size for TCP and UDP socket receive buffers (SO_RCVBUF). The maximum # allowed value is determined by sysctl net.core.rmem_max. This value is # ignored if it is less than the default value determined by # net.core.rmem_default. # For legacy reasons, the default value 0 indicates that the buffer size is set # to connRDMABufNum * connRDMABufSize. # -1 indicates that the buffer size should be left at the system default. # Default: 0 # [connUseRDMA] # Enables the use of Remote Direct Memory Access (RDMA) for Infiniband. # This setting only has effect if libbeegfs-ib is installed. # Default: true # [connRDMABufNum], [connRDMABufSize] # Infiniband RDMA buffer settings. # connRDMABufSize is the maximum size of a buffer (in bytes) that will be sent # over the network; connRDMABufNum is the number of available buffers that can # be in flight for a single connection. These client settings are also applied # on the server side for each connection. 
# Note: RAM usage per connection is connRDMABufSize x connRDMABufNum x 2. Keep # resulting RAM usage (x connMaxInternodeNum x number_of_clients) on the # server in mind when increasing these values. # Note: The client needs to allocate physically contiguous pages for # connRDMABufSize, so this setting shouldn't be higher than a few kbytes. # Default: 8192, 70 # [connRDMATypeOfService] # Infiniband provides the option to set a type of service for an application. # This type of service can be used by your subnet manager to provide Quality of # Service functionality (e.g. setting different service levels). # In openSM the service type will be mapped to the parameter qos-class, which # can be handled in your QoS configuration. # See # www.openfabrics.org/downloads/OFED/ofed-1.4/OFED-1.4-docs/ # QoS_management_in_OpenSM.txt # for more information on how to configure openSM for QoS. # This parameter sets the type of service for all outgoing connections of this # daemon. # Default: 0 (Max: 255) # [connTcpOnlyFilterFile] # The path to a text file that specifies IP address ranges to which no RDMA # connection should be established. This is useful e.g. for environments where # all hosts support RDMA, but some hosts cannot connect via RDMA to some other # hosts. # Example: "192.168.10.0/24" in the first line, "192.168.20.0/24" in the second # line. # Values: This setting is optional. # Default: # [connMessagingTimeouts] # These constants are used to set some of the connection timeouts for sending # and receiving data between services in the cluster. They used to be hard-coded # (CONN_LONG_TIMEOUT, CONN_MEDIUM_TIMEOUT and CONN_SHORT_TIMEOUT) but are now # made configurable for experimentation purposes. # This option takes three integer values of milliseconds, separated by a comma # in the order long, medium, short. # WARNING: This is an EXPERIMENTAL configuration option that should not be # changed in production environments unless properly tested and validated. 
# Some configurations can lead to service lockups and other subtle issues. # Please make sure that you know exactly what you are doing and properly # test any changes you make. # Default: 600000,90000,30000 # [connRDMATimeouts] # These constants are used to set some of the timeouts for sending and receiving # data between services in the cluster via RDMA. They used to be # hard-coded IBVSOCKET_CONN_TIMEOUT_MS, IBVSOCKET_FLOWCONTROL_ONSEND_TIMEOUT_MS # and a 10000 literal for poll timeout but are now made configurable for # experimentation purposes. # This option takes three integer values of milliseconds, separated by a comma # in the order connectMS, flowSendMS and pollMS. # WARNING: This is an EXPERIMENTAL configuration option that should not be # changed in production environments unless properly tested and validated. # Some configurations can lead to service lockups and other subtle issues. # Please make sure that you know exactly what you are doing and properly # test any changes you make. # Default: 3000,180000,7500 # # --- Section 4.2: [Logging] --- # # [logType] # Defines the logger type. This can either be "syslog" to send log messages to # the general system logger or "logfile". If set to logfile logs will be written # to logStdFile. # Default: logfile # [logLevel] # Defines the amount of output messages. The higher this level, the more # detailed the log messages will be. # Note: Levels above 3 might decrease performance. # Default: 3 (Max: 5) # [logNoDate] # Defines whether "date & time" (=false) or the current "time only" (=true) # should be logged. # Default: false # [logNumLines] # The maximum number of lines per log file. # Default: 50000 # [logNumRotatedFiles] # The number of old files to keep when "logNumLines" is reached and the log file # is rewritten (log rotation). # Default: 5 # [logStdFile] # The path and filename of the log file for standard log messages. The parameter # will be considered only if logType value is not equal to syslog. 
# If no name # is specified, the messages will be written to the console. # Default: /var/log/beegfs-meta.log # # --- Section 4.3: [Startup] --- # # [runDaemonized] # Detach the process from its parent (and from stdin/-out/-err). # Default: true # # --- Section 4.4: [Storage] --- # # [storeClientXAttrs] # Enables client-side extended attributes. # Note: Can only be enabled if the underlying file system supports extended # attributes. # Note: This setting has to be explicitly enabled on the clients as well. # Default: false # [storeClientACLs] # Enables the handling and storage of client-side access control lists. # As ACLs are stored as extended attributes, this setting mainly concerns the # enforcement and server-side propagation of directory default ACLs. # Note: This setting can only be enabled if storeClientXAttrs is set to true. # Note: This setting has to be explicitly enabled on all clients as well. # Note: Enabling this setting can affect metadata performance. # Default: false # [storeUseExtendedAttribs] # Controls whether BeeGFS metadata is stored as normal file contents (=false) # or as extended attributes (=true) on the underlying file system. Depending on # the type and version of your underlying local file system, extended attributes # typically are significantly faster. # Note: This setting can only be configured at first startup and cannot be # changed afterwards. # Default: true # # --- Section 4.5: [System Settings] --- # # [sysTargetAttachmentFile] # This file provides a specification of which targets should be grouped within # the same domain for randominternode target chooser. This is useful # e.g. if randominternode is used with multiple storage daemon # instances running on the same physical hosts when files should be striped # across different physical hosts. # Format: Line-separated targetID=domainID definition.
# Example: "101=1" in first line, "102=1" in second line, "201=2" in third # line to define that targets "101" and "102" are part of the same # domain "1", while target "201" is part of a different domain "2". The # domain IDs in this file are arbitrary values in range 1..65535, the # targetIDs are actual targetIDs as in "beegfs-ctl --listtargets". # Default: # [sysTargetOfflineTimeoutSecs] # Timeout until the metadata nodes and storage targets are considered offline # when no target state updates can be fetched from that node. # Note: This must be the same value as in the /etc/beegfs/beegfs-mgmtd.conf on # the management node. # Values: time in seconds # Default: 180 # [sysAllowUserSetPattern] # If set to true, non-privileged users are allowed to modify stripe pattern # settings for directories they own. # Default: false # [sysFileEventLogTarget] # If set, the metadata server will log modification events (which it receives # from clients) to a Unix Socket specified here. External tools may listen on # this socket and process the information. # Note: Each event will be logged in the following format: # droppedSeqNo (64bit) - missedSeqNo (64bit) - eventType (32bit) - ModifiedPath # Increased dropped sequence numbers indicate communication errors. Increased # missed sequence numbers indicate that an event could not be properly reported. # The following event types are possible: # 0(file contents flushed), 1(truncate), 2(set attribute), 3(file closed), # 4(create), 5(mkdir), 6(mknode), 7(create symlink), 8(rmdir), 9(unlink), # 10(create hardlink), 11(rename), 12(read) # Default: # Example: sysFileEventLogTarget = unix:/run/beegfs/eventlog # [sysFileEventPersistDirectory] # If set, the metadata server will persist modification events to this # directory. If unset, it will persist to the "eventq" subdirectory under # metadata-root.
# When the metadata server starts up, it will try to create that directory # (non-recursive mkdir()) and initialize a new event persist store in this # directory. # If creating the directory fails with EEXIST (directory exists) it will assume # an existing persist store and try to load it. # Default: (storeMetaDirectory + "/eventq") # [sysFileEventPersistSize] # If event logging is enabled (see sysFileEventLogTarget), this control # explicitly sets the size (in bytes) for creating the chunk-store file in the # eventq directory (sysFileEventPersistDirectory). # The chunk-store is a file containing a ringbuffer where events get # persisted. The events will eventually be delivered to downstream services # (for example bee-watch, hive-index). # Note that this value has no effect when loading an existing queue directory. # Default: 0 (use internal defaults / guesswork) # Example: sysFileEventPersistSize = 2g # 2 Gigabytes # Note: the value will be rounded up to the next power of 2. # # --- Section 4.6: [Tuning] --- # # [tuneBindToNumaZone] # Defines the zero-based NUMA zone number to which all threads of this process # should be bound. If unset, all available CPU cores may be used. # Zone binding is especially useful if the corresponding devices (e.g. storage # controller and network card) are also attached to the same zone. # Note: The Linux kernel shows NUMA zones at /sys/devices/system/node/nodeXY # Default: # [tuneNumStreamListeners] # The number of threads waiting for incoming data events. Connections with # incoming data will be handed over to the worker threads for actual message # processing. # Default: 1 # [tuneNumWorkers] # The number of worker threads. Higher number of workers allows the server to # handle more client requests in parallel. On dedicated metadata servers, this # is typically set to a value between four and eight times the number of CPU # cores. # Note: 0 means use twice the number of CPU cores (but at least 4). 
# Default: 0 # [tuneTargetChooser] # The algorithm to choose storage targets for file creation. # Values: # * randomized: choose targets in a random fashion. # * roundrobin: choose targets in a deterministic round-robin fashion. # (Use this only for benchmarking of large-file streaming throughput.) # * randomrobin: randomized round-robin; choose targets in a deterministic # round-robin fashion, but randomly shuffle the resulting targets list. # * randominternode: choose random targets that are assigned to different # storage nodeIDs. (See sysTargetAttachmentFile if multiple # storage daemon instances are running on the same physical host.) # Note: Only the randomized chooser honors client's preferred nodes/targets # settings. # Default: randomized # [tuneUseAggressiveStreamPoll] # If set to true, the StreamListener component, which waits for incoming # requests, will keep actively polling for events instead of sleeping until # an event occurs. Active polling will reduce latency for processing of # incoming requests at the cost of higher CPU usage. # Default: false # [tuneUsePerUserMsgQueues] # If set to true, per-user queues will be used to decide which of the pending # requests is handled by the next available worker thread. If set to false, # a single queue will be used and incoming requests will be processed in # first-come, first-served order. # Per-user queues are intended to improve fairness in multi-user environments. # Default: false # [tuneWorkerBufSize] # The buffer size, which is allocated twice by each worker thread for IO and # network data buffering. # Note: For optimal performance, this value must be at least 128k. # Default: 1m # [tuneNumResyncSlaves] # The number of threads used to perform the bulk synchronizations for a buddy # mirror resync. # Default: 12 # # --- Section 4.7: [Quota settings] --- # # [quotaEnableEnforcement] # Enables enforcement of user and group quota limits by periodically checking # if the limits are exceeded.
# Note: This uses quota information provided by the underlying local file # systems of the storage targets. # Note: Set quota limits with "beegfs-ctl --setquota". # Note: If this option is true, performance might be slightly decreased due to # extra information tracking. # Note: Must be set to the same value in storage servers and mgmtd to be # effective. # Default: false # # --- Section 5: [Expert options] --- # # [storeSelfHealEmptyFiles] # Delete metadata entries with no content and handle them as though they had # not existed. Metadata entries with no content can be created by backup # software that has incorrectly saved or restored metadata. # Default: true # [tuneNumCommSlaves] # Number of threads dedicated to parallel communication with other nodes. # Default: 2 * tuneNumWorkers # [tuneCommSlaveBufSize] # Buffer size used by communication threads, analogous to tuneWorkerBufSize. # Default: 1m # [tuneDefaultChunkSize], [tuneDefaultNumStripeTargets] # Chunk size and number of targets to use when creating the root directory. # Files and directories inherit these settings from their parents. # Default: 512k, 4 # [tuneProcessFDLimit] # Sets the maximum number of files the server can open. If the process rlimit # is already larger than this number the limit will not be decreased. # Default: 50000 # [tuneWorkerNumaAffinity] # Distributes worker threads equally among NUMA nodes on the system when set. # Default: false # [tuneListenerNumaAffinity] # Distributes listener threads equally among NUMA nodes on the system when set. # Default: false # [tuneListenerPrioShift] # Applies a niceness offset to listener threads. Negative values will decrease # niceness (increase priority), positive values will increase niceness (decrease # priority). # Default: -1 # [tuneDirMetadataCacheLimit] # Number of recently used directory structures to keep in memory. # Increasing this value may reduce memory allocations and disk I/O.
# Default: 1024 # [tuneLockGrantWaitMS], [tuneLockGrantNumRetries] # Acknowledgement wait parameters for lock grant messages. # Locks that are granted asynchronously (i.e. a client is waiting on the lock) # notify waiting clients with UDP packets. For each waiter a notification # packet is sent and the server waits for tuneLockGrantWaitMS to receive an # acknowledgement from the client. This process is repeated up to # tuneLockGrantNumRetries times. # Default: 333, 15 # [tuneRotateMirrorTargets] # Choose mirror targets for RAID10 patterns by rotating the selected targets. # Default: false # [tuneEarlyUnlinkResponse] # Respond to unlink messages before chunk files have been unlinked. # Default: true # [tuneMirrorTimestamps] # When buddy mirroring, mirror timestamps as exactly as possible. When this is # set to `false` timestamps of mirrored files may be incorrect after a failover # has occurred. Disabling timestamp mirroring gives a slight performance boost. # Default: true # [tuneDisposalGCPeriod] # If > 0, disposal files will not be removed instantly. Instead a garbage collector # will run on each meta node. This sets the wait time in seconds between runs. # Default: 0 # [quotaEarlyChownResponse] # Respond to client chown() requests before chunk files have been changed. # Quota relies on chunk files having the owner and group information stored in # metadata. Therefore, setting this to true creates a short time window after # a chown where the application and the servers have a different view on quota. # Default: true # [pidFile] # Creates a PID file for the daemon when set. Set by init scripts. # Default: