diff options
284 files changed, 11491 insertions, 5171 deletions
diff --git a/Documentation/ABI/testing/securityfs-secrets-coco b/Documentation/ABI/testing/securityfs-secrets-coco new file mode 100644 index 000000000000..f2b6909155f9 --- /dev/null +++ b/Documentation/ABI/testing/securityfs-secrets-coco @@ -0,0 +1,51 @@ +What: security/secrets/coco +Date: February 2022 +Contact: Dov Murik <dovmurik@linux.ibm.com> +Description: + Exposes confidential computing (coco) EFI secrets to + userspace via securityfs. + + EFI can declare memory area used by confidential computing + platforms (such as AMD SEV and SEV-ES) for secret injection by + the Guest Owner during VM's launch. The secrets are encrypted + by the Guest Owner and decrypted inside the trusted enclave, + and therefore are not readable by the untrusted host. + + The efi_secret module exposes the secrets to userspace. Each + secret appears as a file under <securityfs>/secrets/coco, + where the filename is the GUID of the entry in the secrets + table. This module is loaded automatically by the EFI driver + if the EFI secret area is populated. + + Two operations are supported for the files: read and unlink. + Reading the file returns the content of secret entry. + Unlinking the file overwrites the secret data with zeroes and + removes the entry from the filesystem. A secret cannot be read + after it has been unlinked. + + For example, listing the available secrets:: + + # modprobe efi_secret + # ls -l /sys/kernel/security/secrets/coco + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910 + + Reading the secret data by reading a file:: + + # cat /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + the-content-of-the-secret-data + + Wiping a secret by unlinking a file:: + + # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + # ls -l /sys/kernel/security/secrets/coco + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + + Note: The binary format of the secrets table injected by the + Guest Owner is described in + drivers/virt/coco/efi_secret/efi_secret.c under "Structure of + the EFI secret area". diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index f4efd6897b09..b34990c7c377 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -973,7 +973,7 @@ The ``->dynticks`` field counts the corresponding CPU's transitions to and from either dyntick-idle or user mode, so that this counter has an even value when the CPU is in dyntick-idle mode or user mode and an odd value otherwise. The transitions to/from user mode need to be counted -for user mode adaptive-ticks support (see timers/NO_HZ.txt). +for user mode adaptive-ticks support (see Documentation/timers/no_hz.rst). The ``->rcu_need_heavy_qs`` field is used to record the fact that the RCU core code would really like to see a quiescent state from the diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index 6f89cf1e567d..c9c957c85bac 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -406,7 +406,7 @@ In earlier implementations, the task requesting the expedited grace period also drove it to completion. This straightforward approach had the disadvantage of needing to account for POSIX signals sent to user tasks, so more recent implemementations use the Linux kernel's -`workqueues <https://www.kernel.org/doc/Documentation/core-api/workqueue.rst>`__. +workqueues (see Documentation/core-api/workqueue.rst). The requesting task still does counter snapshotting and funnel-lock processing, but the task reaching the top of the funnel lock does a diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 45278e2974c0..04ed8bf27a0e 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -370,8 +370,8 @@ pointer fetched by rcu_dereference() may not be used outside of the outermost RCU read-side critical section containing that rcu_dereference(), unless protection of the corresponding data element has been passed from RCU to some other synchronization -mechanism, most commonly locking or `reference -counting <https://www.kernel.org/doc/Documentation/RCU/rcuref.txt>`__. +mechanism, most commonly locking or reference counting +(see ../../rcuref.rst). .. |high-quality implementation of C11 memory_order_consume [PDF]| replace:: high-quality implementation of C11 ``memory_order_consume`` [PDF] .. _high-quality implementation of C11 memory_order_consume [PDF]: http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf @@ -2654,6 +2654,38 @@ synchronize_rcu(), and rcu_barrier(), respectively. In three APIs are therefore implemented by separate functions that check for voluntary context switches. +Tasks Rude RCU +~~~~~~~~~~~~~~ + +Some forms of tracing need to wait for all preemption-disabled regions +of code running on any online CPU, including those executed when RCU is +not watching. This means that synchronize_rcu() is insufficient, and +Tasks Rude RCU must be used instead. This flavor of RCU does its work by +forcing a workqueue to be scheduled on each online CPU, hence the "Rude" +moniker. And this operation is considered to be quite rude by real-time +workloads that don't want their ``nohz_full`` CPUs receiving IPIs and +by battery-powered systems that don't want their idle CPUs to be awakened. + +The tasks-rude-RCU API is also reader-marking-free and thus quite compact, +consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(), +and rcu_barrier_tasks_rude(). + +Tasks Trace RCU +~~~~~~~~~~~~~~~ + +Some forms of tracing need to sleep in readers, but cannot tolerate +SRCU's read-side overhead, which includes a full memory barrier in both +srcu_read_lock() and srcu_read_unlock(). This need is handled by a +Tasks Trace RCU that uses scheduler locking and IPIs to synchronize with +readers. Real-time systems that cannot tolerate IPIs may build their +kernels with ``CONFIG_TASKS_TRACE_RCU_READ_MB=y``, which avoids the IPIs at +the expense of adding full memory barriers to the read-side primitives. + +The tasks-trace-RCU API is also reasonably compact, +consisting of rcu_read_lock_trace(), rcu_read_unlock_trace(), +rcu_read_lock_trace_held(), call_rcu_tasks_trace(), +synchronize_rcu_tasks_trace(), and rcu_barrier_tasks_trace(). + Possible Future Changes ----------------------- diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst index 4051ea3871ef..a5f2ff8fc54c 100644 --- a/Documentation/RCU/arrayRCU.rst +++ b/Documentation/RCU/arrayRCU.rst @@ -33,8 +33,8 @@ Situation 1: Hash Tables Hash tables are often implemented as an array, where each array entry has a linked-list hash chain. Each hash chain can be protected by RCU -as described in the listRCU.txt document. This approach also applies -to other array-of-list situations, such as radix trees. +as described in listRCU.rst. This approach also applies to other +array-of-list situations, such as radix trees. .. _static_arrays: diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index f4545b7c9a63..42cc5d891bd2 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -140,8 +140,7 @@ over a rather long period of time, but improvements are always welcome! prevents destructive compiler optimizations. However, with a bit of devious creativity, it is possible to mishandle the return value from rcu_dereference(). - Please see rcu_dereference.txt in this directory for - more information. + Please see rcu_dereference.rst for more information. The rcu_dereference() primitive is used by the various "_rcu()" list-traversal primitives, such @@ -151,7 +150,7 @@ over a rather long period of time, but improvements are always welcome! primitives. This is particularly useful in code that is common to readers and updaters. However, lockdep will complain if you access rcu_dereference() outside - of an RCU read-side critical section. See lockdep.txt + of an RCU read-side critical section. See lockdep.rst to learn what to do about this. Of course, neither rcu_dereference() nor the "_rcu()" @@ -323,7 +322,7 @@ over a rather long period of time, but improvements are always welcome! primitives when the update-side lock is held is that doing so can be quite helpful in reducing code bloat when common code is shared between readers and updaters. Additional primitives - are provided for this case, as discussed in lockdep.txt. + are provided for this case, as discussed in lockdep.rst. One exception to this rule is when data is only ever added to the linked data structure, and is never removed during any @@ -480,4 +479,4 @@ over a rather long period of time, but improvements are always welcome! both rcu_barrier() and synchronize_rcu(), if necessary, using something like workqueues to to execute them concurrently. - See rcubarrier.txt for more information. + See rcubarrier.rst for more information. diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 0e03c6ef3147..3cfe01ba9a49 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -10,9 +10,8 @@ A "grace period" must elapse between the two parts, and this grace period must be long enough that any readers accessing the item being deleted have since dropped their references. For example, an RCU-protected deletion from a linked list would first remove the item from the list, wait for -a grace period to elapse, then free the element. See the -:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` for more information on -using RCU with linked lists. +a grace period to elapse, then free the element. See listRCU.rst for more +information on using RCU with linked lists. Frequently Asked Questions -------------------------- @@ -50,7 +49,7 @@ Frequently Asked Questions - If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? - See :ref:`Documentation/RCU/UP.rst <up_doc>` for more information. + See UP.rst for more information. - How can I see where RCU is currently used in the Linux kernel? @@ -64,13 +63,13 @@ Frequently Asked Questions - What guidelines should I follow when writing code that uses RCU? - See the checklist.txt file in this directory. + See checklist.rst. - Why the name "RCU"? "RCU" stands for "read-copy update". - :ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` has more information on where - this name came from, search for "read-copy update" to find it. + listRCU.rst has more information on where this name came from, search + for "read-copy update" to find it. - I hear that RCU is patented? What is with that? diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst index a9fc774bc400..ca4692775ad4 100644 --- a/Documentation/RCU/rculist_nulls.rst +++ b/Documentation/RCU/rculist_nulls.rst @@ -8,7 +8,7 @@ This section describes how to use hlist_nulls to protect read-mostly linked lists and objects using SLAB_TYPESAFE_BY_RCU allocations. -Please read the basics in Documentation/RCU/listRCU.rst +Please read the basics in listRCU.rst. Using 'nulls' ============= diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 78404625bad2..794837eb519b 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -162,6 +162,26 @@ CONFIG_RCU_CPU_STALL_TIMEOUT Stall-warning messages may be enabled and disabled completely via /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. +CONFIG_RCU_EXP_CPU_STALL_TIMEOUT +-------------------------------- + + Same as the CONFIG_RCU_CPU_STALL_TIMEOUT parameter but only for + the expedited grace period. This parameter defines the period + of time that RCU will wait from the beginning of an expedited + grace period until it issues an RCU CPU stall warning. This time + period is normally 20 milliseconds on Android devices. A zero + value causes the CONFIG_RCU_CPU_STALL_TIMEOUT value to be used, + after conversion to milliseconds. + + This configuration parameter may be changed at runtime via the + /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout, however + this parameter is checked only at the beginning of a cycle. If you + are in a current stall cycle, setting it to a new value will change + the timeout for the -next- stall. + + Stall-warning messages may be enabled and disabled completely via + /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. + RCU_STALL_DELAY_DELTA --------------------- diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index c34d2212eaca..77ea260efd12 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -224,7 +224,7 @@ synchronize_rcu() be delayed. This property results in system resilience in face of denial-of-service attacks. Code using call_rcu() should limit update rate in order to gain this same sort of resilience. See - checklist.txt for some approaches to limiting the update rate. + checklist.rst for some approaches to limiting the update rate. rcu_assign_pointer() ^^^^^^^^^^^^^^^^^^^^ @@ -318,7 +318,7 @@ rcu_dereference() must prohibit. The rcu_dereference_protected() variant takes a lockdep expression to indicate which locks must be acquired by the caller. If the indicated protection is not provided, - a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst + a lockdep splat is emitted. See Design/Requirements/Requirements.rst and the API's code comments for more details and example usage. .. [2] If the list_for_each_entry_rcu() instance might be used by @@ -399,8 +399,7 @@ for specialized uses, but are relatively uncommon. This section shows a simple use of the core RCU API to protect a global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`, -:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`. +uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst. :: struct foo { @@ -482,10 +481,9 @@ So, to sum up: RCU read-side critical sections that might be referencing that data item. -See checklist.txt for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in :ref:`listRCU.rst -<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst -<NMI_rcu_doc>`. +See checklist.rst for additional rules to follow when using RCU. +And again, more-typical uses of RCU may be found in listRCU.rst, +arrayRCU.rst, and NMI-RCU.rst. .. _4_whatisRCU: @@ -579,7 +577,7 @@ to avoid having to write your own callback:: kfree_rcu(old_fp, rcu); -Again, see checklist.txt for additional rules governing the use of RCU. +Again, see checklist.rst for additional rules governing the use of RCU. .. _5_whatisRCU: @@ -663,7 +661,7 @@ been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() promotes synchronize_rcu() to a full memory barrier in compliance with the "Memory-Barrier Guarantees" listed in: - Documentation/RCU/Design/Requirements/Requirements.rst + Design/Requirements/Requirements.rst It is possible to nest rcu_read_lock(), since reader-writer locks may be recursively acquired. Note also that rcu_read_lock() is immune diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..f8d9af5d51e5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4893,6 +4893,18 @@ rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. + The value is in seconds and the maximum allowed + value is 300 seconds. + + rcupdate.rcu_exp_cpu_stall_timeout= [KNL] + Set timeout for expedited RCU CPU stall warning + messages. The value is in milliseconds + and the maximum allowed value is 21000 + milliseconds. Please note that this value is + adjusted to an arch timer tick resolution. + Setting this to zero causes the value from + rcupdate.rcu_cpu_stall_timeout to be used (after + conversion from seconds to milliseconds). rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for @@ -4955,10 +4967,34 @@ number avoids disturbing real-time workloads, but lengthens grace periods. + rcupdate.rcu_task_stall_info= [KNL] + Set initial timeout in jiffies for RCU task stall + informational messages, which give some indication + of the problem for those not patient enough to + wait for ten minutes. Informational messages are + only printed prior to the stall-warning message + for a given grace period. Disable with a value + less than or equal to zero. Defaults to ten + seconds. A change in value does not take effect + until the beginning of the next grace period. + + rcupdate.rcu_task_stall_info_mult= [KNL] + Multiplier for time interval between successive + RCU task stall informational messages for a given + RCU tasks grace period. This value is clamped + to one through ten, inclusive. It defaults to + the value three, so that the first informational + message is printed 10 seconds into the grace + period, the second at 40 seconds, the third at + 160 seconds, and then the stall warning at 600 + seconds would prevent a fourth at 640 seconds. + rcupdate.rcu_task_stall_timeout= [KNL] - Set timeout in jiffies for RCU task stall warning - messages. Disable with a value less than or equal - to zero. + Set timeout in jiffies for RCU task stall + warning messages. Disable with a value less + than or equal to zero. Defaults to ten minutes. + A change in value does not take effect until + the beginning of the next grace period. rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests @@ -5377,6 +5413,17 @@ smart2= [HW] Format: <io1>[,<io2>[,...,<io8>]] + smp.csd_lock_timeout= [KNL] + Specify the period of time in milliseconds + that smp_call_function() and friends will wait + for a CPU to release the CSD lock. This is + useful when diagnosing bugs involving CPUs + disabling interrupts for extended periods + of time. Defaults to 5,000 milliseconds, and + setting a value of zero disables this feature. + This feature may be more efficiently disabled + using the csdlock_debug- kernel parameter. + smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port smsc-ircc2.ircc_sir= [HW] SIR base I/O port @@ -5608,6 +5655,30 @@ off: Disable mitigation and remove performance impact to RDRAND and RDSEED + srcutree.big_cpu_lim [KNL] + Specifies the number of CPUs constituting a + large system, such that srcu_struct structures + should immediately allocate an srcu_node array. + This kernel-boot parameter defaults to 128, + but takes effect only when the low-order four + bits of srcutree.convert_to_big is equal to 3 + (decide at boot). + + srcutree.convert_to_big [KNL] + Specifies under what conditions an SRCU tree + srcu_struct structure will be converted to big + form, that is, with an rcu_node tree: + + 0: Never. + 1: At init_srcu_struct() time. + 2: When rcutorture decides to. + 3: Decide at boot time (default). + 0x1X: Above plus if high contention. + + Either way, the srcu_node tree will be sized based + on the actual runtime number of CPUs (nr_cpu_ids) + instead of the compile-time CONFIG_NR_CPUS. + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the @@ -5625,6 +5696,14 @@ expediting. Set to zero to disable automatic expediting. + srcutree.small_contention_lim [KNL] + Specifies the number of update-side contention + events per jiffy will be tolerated before + initiating a conversion of an srcu_struct + structure to big form. Note that the value of + srcutree.convert_to_big must have the 0x10 bit + set for contention-based conversions to occur. + ssbd= [ARM64,HW] Speculative Store Bypass Disable control diff --git a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml index b1770640f94b..03ebd2665d07 100644 --- a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml +++ b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Mediatek's Keypad Controller device tree bindings maintainers: - - Fengping Yu <fengping.yu@mediatek.com> + - Mattijs Korpershoek <mkorpershoek@baylibre.com> allOf: - $ref: "/schemas/input/matrix-keymap.yaml#" diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml index 57b68d6c7c70..3666ac5b6518 100644 --- a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml @@ -33,7 +33,7 @@ patternProperties: $ref: "/schemas/types.yaml#/definitions/string" enum: [ ADC0, ADC1, ADC10, ADC11, ADC12, ADC13, ADC14, ADC15, ADC2, ADC3, ADC4, ADC5, ADC6, ADC7, ADC8, ADC9, BMCINT, EMMC, ESPI, ESPIALT, - FSI1, FSI2, FWSPIABR, FWSPID, FWSPIWP, GPIT0, GPIT1, GPIT2, GPIT3, + FSI1, FSI2, FWQSPI, FWSPIABR, FWSPID, FWSPIWP, GPIT0, GPIT1, GPIT2, GPIT3, GPIT4, GPIT5, GPIT6, GPIT7, GPIU0, GPIU1, GPIU2, GPIU3, GPIU4, GPIU5, GPIU6, GPIU7, I2C1, I2C10, I2C11, I2C12, I2C13, I2C14, I2C15, I2C16, I2C2, I2C3, I2C4, I2C5, I2C6, I2C7, I2C8, I2C9, I3C3, I3C4, I3C5, @@ -58,7 +58,7 @@ patternProperties: $ref: "/schemas/types.yaml#/definitions/string" enum: [ ADC0, ADC1, ADC10, ADC11, ADC12, ADC13, ADC14, ADC15, ADC2, ADC3, ADC4, ADC5, ADC6, ADC7, ADC8, ADC9, BMCINT, EMMCG1, EMMCG4, - EMMCG8, ESPI, ESPIALT, FSI1, FSI2, FWSPIABR, FWSPID, FWQSPID, FWSPIWP, + EMMCG8, ESPI, ESPIALT, FSI1, FSI2, FWQSPI, FWSPIABR, FWSPID, FWSPIWP, GPIT0, GPIT1, GPIT2, GPIT3, GPIT4, GPIT5, GPIT6, GPIT7, GPIU0, GPIU1, GPIU2, GPIU3, GPIU4, GPIU5, GPIU6, GPIU7, HVI3C3, HVI3C4, I2C1, I2C10, I2C11, I2C12, I2C13, I2C14, I2C15, I2C16, I2C2, I2C3, I2C4, I2C5, diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index 16335de04e8c..6ed8d2fa6f9e 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -17,3 +17,4 @@ Security Documentation tpm/index digsig landlock + secrets/index diff --git a/Documentation/security/secrets/coco.rst b/Documentation/security/secrets/coco.rst new file mode 100644 index 000000000000..262e7abb1b24 --- /dev/null +++ b/Documentation/security/secrets/coco.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Confidential Computing secrets +============================== + +This document describes how Confidential Computing secret injection is handled +from the firmware to the operating system, in the EFI driver and the efi_secret +kernel module. + + +Introduction +============ + +Confidential Computing (coco) hardware such as AMD SEV (Secure Encrypted +Virtualization) allows guest owners to inject secrets into the VMs +memory without the host/hypervisor being able to read them. In SEV, +secret injection is performed early in the VM launch process, before the +guest starts running. + +The efi_secret kernel module allows userspace applications to access these +secrets via securityfs. + + +Secret data flow +================ + +The guest firmware may reserve a designated memory area for secret injection, +and publish its location (base GPA and length) in the EFI configuration table +under a ``LINUX_EFI_COCO_SECRET_AREA_GUID`` entry +(``adf956ad-e98c-484c-ae11-b51c7d336447``). This memory area should be marked +by the firmware as ``EFI_RESERVED_TYPE``, and therefore the kernel should not +be use it for its own purposes. + +During the VM's launch, the virtual machine manager may inject a secret to that +area. In AMD SEV and SEV-ES this is performed using the +``KVM_SEV_LAUNCH_SECRET`` command (see [sev]_). The strucutre of the injected +Guest Owner secret data should be a GUIDed table of secret values; the binary +format is described in ``drivers/virt/coco/efi_secret/efi_secret.c`` under +"Structure of the EFI secret area". + +On kernel start, the kernel's EFI driver saves the location of the secret area +(taken from the EFI configuration table) in the ``efi.coco_secret`` field. +Later it checks if the secret area is populated: it maps the area and checks +whether its content begins with ``EFI_SECRET_TABLE_HEADER_GUID`` +(``1e74f542-71dd-4d66-963e-ef4287ff173b``). If the secret area is populated, +the EFI driver will autoload the efi_secret kernel module, which exposes the +secrets to userspace applications via securityfs. The details of the +efi_secret filesystem interface are in [secrets-coco-abi]_. + + +Application usage example +========================= + +Consider a guest performing computations on encrypted files. The Guest Owner +provides the decryption key (= secret) using the secret injection mechanism. +The guest application reads the secret from the efi_secret filesystem and +proceeds to decrypt the files into memory and then performs the needed +computations on the content. + +In this example, the host can't read the files from the disk image +because they are encrypted. Host can't read the decryption key because +it is passed using the secret injection mechanism (= secure channel). +Host can't read the decrypted content from memory because it's a +confidential (memory-encrypted) guest. + +Here is a simple example for usage of the efi_secret module in a guest +to which an EFI secret area with 4 secrets was injected during launch:: + + # ls -la /sys/kernel/security/secrets/coco + total 0 + drwxr-xr-x 2 root root 0 Jun 28 11:54 . + drwxr-xr-x 3 root root 0 Jun 28 11:54 .. + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910 + + # hd /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + 00000000 74 68 65 73 65 2d 61 72 65 2d 74 68 65 2d 6b 61 |these-are-the-ka| + 00000010 74 61 2d 73 65 63 72 65 74 73 00 01 02 03 04 05 |ta-secrets......| + 00000020 06 07 |..| + 00000022 + + # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910 + + # ls -la /sys/kernel/security/secrets/coco + total 0 + drwxr-xr-x 2 root root 0 Jun 28 11:55 . + drwxr-xr-x 3 root root 0 Jun 28 11:54 .. + -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b + -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6 + -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2 + + +References +========== + +See [sev-api-spec]_ for more info regarding SEV ``LAUNCH_SECRET`` operation. + +.. [sev] Documentation/virt/kvm/amd-memory-encryption.rst +.. [secrets-coco-abi] Documentation/ABI/testing/securityfs-secrets-coco +.. [sev-api-spec] https://www.amd.com/system/files/TechDocs/55766_SEV-KM_API_Specification.pdf diff --git a/Documentation/security/secrets/index.rst b/Documentation/security/secrets/index.rst new file mode 100644 index 000000000000..ced34e9c43bd --- /dev/null +++ b/Documentation/security/secrets/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Secrets documentation +===================== + +.. toctree:: + + coco diff --git a/MAINTAINERS b/MAINTAINERS index d6d879cb0afd..f468864fd268 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14374,7 +14374,6 @@ F: arch/arm/*omap*/*pm* F: drivers/cpufreq/omap-cpufreq.c OMAP POWERDOMAIN SOC ADAPTATION LAYER SUPPORT -M: Rajendra Nayak <rnayak@codeaurora.org> M: Paul Walmsley <paul@pwsan.com> L: linux-omap@vger.kernel.org S: Maintained @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Superb Owl # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index 31c4fdc4a4ba..513a29c62a9b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -35,6 +35,7 @@ config KPROBES depends on MODULES depends on HAVE_KPROBES select KALLSYMS + select TASKS_RCU if PREEMPTION help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts b/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts index e71ccfd1df63..ff4c07c69af1 100644 --- a/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts +++ b/arch/arm/boot/dts/aspeed-bmc-asrock-romed8hm3.dts @@ -100,12 +100,14 @@ lm25066@40 { compatible = "lm25066"; reg = <0x40>; + shunt-resistor-micro-ohms = <1000>; }; /* 12VSB PMIC */ lm25066@41 { compatible = "lm25066"; reg = <0x41>; + shunt-resistor-micro-ohms = <10000>; }; }; @@ -196,7 +198,7 @@ gpio-line-names = /* A */ "LOCATORLED_STATUS_N", "BMC_MAC2_INTB", "NMI_BTN_N", "BMC_NMI", "", "", "", "", - /* B */ "DDR_MEM_TEMP", "", "", "", "", "", "", "", + /* B */ "POST_COMPLETE_N", "", "", "", "", "", "", "", /* C */ "", "", "", "", "PCIE_HP_SEL_N", "PCIE_SATA_SEL_N", "LOCATORBTN", "", /* D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON", "", "", "", "PSU_FAN_FAIL_N", diff --git a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi index e4775bbceecc..7cd4f075e325 100644 --- a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi +++ b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi @@ -117,9 +117,9 @@ groups = "FWSPID"; }; - pinctrl_fwqspid_default: fwqspid_default { - function = "FWSPID"; - groups = "FWQSPID"; + pinctrl_fwqspi_default: fwqspi_default { + function = "FWQSPI"; + groups = "FWQSPI"; }; pinctrl_fwspiwp_default: fwspiwp_default { @@ -653,12 +653,12 @@ }; pinctrl_qspi1_default: qspi1_default { - function = "QSPI1"; + function = "SPI1"; groups = "QSPI1"; }; pinctrl_qspi2_default: qspi2_default { - function = "QSPI2"; + function = "SPI2"; groups = "QSPI2"; }; diff --git a/arch/arm/boot/dts/aspeed-g6.dtsi b/arch/arm/boot/dts/aspeed-g6.dtsi index 3d5ce9da42c3..9d2a0ce4ca06 100644 --- a/arch/arm/boot/dts/aspeed-g6.dtsi +++ b/arch/arm/boot/dts/aspeed-g6.dtsi @@ -389,6 +389,16 @@ reg = <0x1e6f2000 0x1000>; }; + video: video@1e700000 { + compatible = "aspeed,ast2600-video-engine"; + reg = <0x1e700000 0x1000>; + clocks = <&syscon ASPEED_CLK_GATE_VCLK>, + <&syscon ASPEED_CLK_GATE_ECLK>; + clock-names = "vclk", "eclk"; + interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + gpio0: gpio@1e780000 { #gpio-cells = <2>; gpio-controller; diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 06508698abb8..7a8682468a84 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1145,7 +1145,7 @@ vector_bhb_loop8_\name: @ bhb workaround mov r0, #8 -3: b . + 4 +3: W(b) . + 4 subs r0, r0, #1 bne 3b dsb diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index 06dbfb968182..fb9f3eb6bf48 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -288,6 +288,7 @@ void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } void cpu_v7_bugs_init(void) diff --git a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts index fb99cc2827c7..7ab3627cc347 100644 --- a/arch/arm64/boot/dts/qcom/sm8250-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8250-mtp.dts @@ -622,6 +622,10 @@ status = "okay"; }; +&rxmacro { + status = "okay"; +}; + &slpi { status = "okay"; firmware-name = "qcom/sm8250/slpi.mbn"; @@ -773,6 +777,8 @@ }; &swr1 { + status = "okay"; + wcd_rx: wcd9380-rx@0,4 { compatible = "sdw20217010d00"; reg = <0 4>; @@ -781,6 +787,8 @@ }; &swr2 { + status = "okay"; + wcd_tx: wcd9380-tx@0,3 { compatible = "sdw20217010d00"; reg = <0 3>; @@ -819,6 +827,10 @@ }; }; +&txmacro { + status = "okay"; +}; + &uart12 { status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi index af8f22636436..1304b86af1a0 100644 --- a/arch/arm64/boot/dts/qcom/sm8250.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi @@ -2255,6 +2255,7 @@ pinctrl-0 = <&rx_swr_active>; compatible = "qcom,sm8250-lpass-rx-macro"; reg = <0 0x3200000 0 0x1000>; + status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, <&q6afecc LPASS_CLK_ID_TX_CORE_NPL_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2273,6 +2274,7 @@ swr1: soundwire-controller@3210000 { reg = <0 0x3210000 0 0x2000>; compatible = "qcom,soundwire-v1.5.1"; + status = "disabled"; interrupts = <GIC_SPI 298 IRQ_TYPE_LEVEL_HIGH>; clocks = <&rxmacro>; clock-names = "iface"; @@ -2300,6 +2302,7 @@ pinctrl-0 = <&tx_swr_active>; compatible = "qcom,sm8250-lpass-tx-macro"; reg = <0 0x3220000 0 0x1000>; + status = "disabled"; clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, <&q6afecc LPASS_CLK_ID_TX_CORE_NPL_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>, @@ -2323,6 +2326,7 @@ compatible = "qcom,soundwire-v1.5.1"; interrupts-extended = <&intc GIC_SPI 297 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "core"; + status = "disabled"; clocks = <&txmacro>; clock-names = "iface"; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts b/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts index a01886b467ed..067fe4a6b178 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-bpi-r2-pro.dts @@ -16,6 +16,7 @@ aliases { ethernet0 = &gmac0; + ethernet1 = &gmac1; mmc0 = &sdmmc0; mmc1 = &sdhci; }; @@ -78,7 +79,6 @@ assigned-clocks = <&cru SCLK_GMAC0_RX_TX>, <&cru SCLK_GMAC0>; assigned-clock-parents = <&cru SCLK_GMAC0_RGMII_SPEED>, <&cru CLK_MAC0_2TOP>; clock_in_out = "input"; - phy-handle = <&rgmii_phy0>; phy-mode = "rgmii"; pinctrl-names = "default"; pinctrl-0 = <&gmac0_miim @@ -90,8 +90,38 @@ snps,reset-active-low; /* Reset time is 20ms, 100ms for rtl8211f */ snps,reset-delays-us = <0 20000 100000>; + tx_delay = <0x4f>; + rx_delay = <0x0f>; + status = "okay"; + + fixed-link { + speed = <1000>; + full-duplex; + pause; + }; +}; + +&gmac1 { + assigned-clocks = <&cru SCLK_GMAC1_RX_TX>, <&cru SCLK_GMAC1>; + assigned-clock-parents = <&cru SCLK_GMAC1_RGMII_SPEED>, <&cru CLK_MAC1_2TOP>; + clock_in_out = "output"; + phy-handle = <&rgmii_phy1>; + phy-mode = "rgmii"; + pinctrl-names = "default"; + pinctrl-0 = <&gmac1m1_miim + &gmac1m1_tx_bus2 + &gmac1m1_rx_bus2 + &gmac1m1_rgmii_clk + &gmac1m1_rgmii_bus>; + + snps,reset-gpio = <&gpio3 RK_PB0 GPIO_ACTIVE_LOW>; + snps,reset-active-low; + /* Reset time is 20ms, 100ms for rtl8211f */ + snps,reset-delays-us = <0 20000 100000>; + tx_delay = <0x3c>; rx_delay = <0x2f>; + status = "okay"; }; @@ -315,8 +345,8 @@ status = "disabled"; }; -&mdio0 { - rgmii_phy0: ethernet-phy@0 { +&mdio1 { + rgmii_phy1: ethernet-phy@0 { compatible = "ethernet-phy-ieee802.3-c22"; reg = <0x0>; }; @@ -345,9 +375,9 @@ pmuio2-supply = <&vcc3v3_pmu>; vccio1-supply = <&vccio_acodec>; vccio3-supply = <&vccio_sd>; - vccio4-supply = <&vcc_1v8>; + vccio4-supply = <&vcc_3v3>; vccio5-supply = <&vcc_3v3>; - vccio6-supply = <&vcc_3v3>; + vccio6-supply = <&vcc_1v8>; vccio7-supply = <&vcc_3v3>; status = "okay"; }; diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 78b3e0f8e997..d502703e8373 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -76,6 +76,9 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); } int memcmp_pages(struct page *page1, struct page *page2) diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index 75fed4460407..57c7c211f8c7 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -35,7 +35,7 @@ static u64 native_steal_clock(int cpu) DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { - struct pvclock_vcpu_stolen_time *kaddr; + struct pvclock_vcpu_stolen_time __rcu *kaddr; }; static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region); @@ -52,7 +52,9 @@ early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ static u64 para_steal_clock(int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; + u64 ret = 0; reg = per_cpu_ptr(&stolen_time_region, cpu); @@ -61,28 +63,37 @@ static u64 para_steal_clock(int cpu) * online notification callback runs. Until the callback * has run we just return zero. */ - if (!reg->kaddr) + rcu_read_lock(); + kaddr = rcu_dereference(reg->kaddr); + if (!kaddr) { + rcu_read_unlock(); return 0; + } - return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time)); + ret = le64_to_cpu(READ_ONCE(kaddr->stolen_time)); + rcu_read_unlock(); + return ret; } static int stolen_time_cpu_down_prepare(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; reg = this_cpu_ptr(&stolen_time_region); if (!reg->kaddr) return 0; - memunmap(reg->kaddr); - memset(reg, 0, sizeof(*reg)); + kaddr = rcu_replace_pointer(reg->kaddr, NULL, true); + synchronize_rcu(); + memunmap(kaddr); return 0; } static int stolen_time_cpu_online(unsigned int cpu) { + struct pvclock_vcpu_stolen_time *kaddr = NULL; struct pv_time_stolen_time_region *reg; struct arm_smccc_res res; @@ -93,17 +104,19 @@ static int stolen_time_cpu_online(unsigned int cpu) if (res.a0 == SMCCC_RET_NOT_SUPPORTED) return -EINVAL; - reg->kaddr = memremap(res.a0, + kaddr = memremap(res.a0, sizeof(struct pvclock_vcpu_stolen_time), MEMREMAP_WB); + rcu_assign_pointer(reg->kaddr, kaddr); + if (!reg->kaddr) { pr_warn("Failed to map stolen time data structure\n"); return -ENOMEM; } - if (le32_to_cpu(reg->kaddr->revision) != 0 || - le32_to_cpu(reg->kaddr->attributes) != 0) { + if (le32_to_cpu(kaddr->revision) != 0 || + le32_to_cpu(kaddr->attributes) != 0) { pr_warn_once("Unexpected revision or attributes in stolen time data\n"); return -ENXIO; } diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index f0a3df9e18a3..413f899e4ac6 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -37,6 +37,15 @@ * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) + /* + * The kimage structure isn't allocated specially and may be clobbered + * during relocation. We must load any values we need from it prior to + * any relocation occurring. + */ + ldr x28, [x0, #KIMAGE_START] + ldr x27, [x0, #KIMAGE_ARCH_EL2_VECTORS] + ldr x26, [x0, #KIMAGE_ARCH_DTB_MEM] + /* Setup the list loop variables. */ ldr x18, [x0, #KIMAGE_ARCH_ZERO_PAGE] /* x18 = zero page for BBM */ ldr x17, [x0, #KIMAGE_ARCH_TTBR1] /* x17 = linear map copy */ @@ -72,21 +81,20 @@ SYM_CODE_START(arm64_relocate_new_kernel) ic iallu dsb nsh isb - ldr x4, [x0, #KIMAGE_START] /* relocation start */ - ldr x1, [x0, #KIMAGE_ARCH_EL2_VECTORS] /* relocation start */ - ldr x0, [x0, #KIMAGE_ARCH_DTB_MEM] /* dtb address */ turn_off_mmu x12, x13 /* Start new image. */ - cbz x1, .Lel1 - mov x1, x4 /* relocation start */ - mov x2, x0 /* dtb address */ + cbz x27, .Lel1 + mov x1, x28 /* kernel entry point */ + mov x2, x26 /* dtb address */ mov x3, xzr mov x4, xzr mov x0, #HVC_SOFT_RESTART hvc #0 /* Jumps from el2 */ .Lel1: + mov x0, x26 /* dtb address */ + mov x1, xzr mov x2, xzr mov x3, xzr - br x4 /* Jumps from el1 */ + br x28 /* Jumps from el1 */ SYM_CODE_END(arm64_relocate_new_kernel) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 523bc934fe2f..a66d83540c15 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1436,7 +1436,8 @@ static int kvm_init_vector_slots(void) base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - if (kvm_system_needs_idmapped_vectors() && !has_vhe()) { + if (kvm_system_needs_idmapped_vectors() && + !is_protected_kvm_enabled()) { err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), __BP_HARDEN_HYP_VECS_SZ, &base); if (err) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7b45c040cc27..adf408c09cdb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1123,8 +1123,7 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); - if (irqchip_in_kernel(vcpu->kvm) && - vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_GIC); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_GIC), 1); } diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index e8b4a03343d3..8d03b3b26229 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -59,20 +59,12 @@ void flush_dcache_page(struct page *page); flush_kernel_icache_range_asm(s,e); \ } while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { \ - flush_cache_page(vma, vaddr, page_to_pfn(page)); \ - memcpy(dst, src, len); \ - flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); \ -} while (0) - -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ -do { \ - flush_cache_page(vma, vaddr, page_to_pfn(page)); \ - memcpy(dst, src, len); \ -} while (0) - -void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); +void copy_to_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len); +void copy_from_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len); +void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, + unsigned long pfn); void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -80,16 +72,7 @@ void flush_cache_range(struct vm_area_struct *vma, void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); #define ARCH_HAS_FLUSH_ANON_PAGE -static inline void -flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) -{ - if (PageAnon(page)) { - flush_tlb_page(vma, vmaddr); - preempt_disable(); - flush_dcache_page_asm(page_to_phys(page), vmaddr); - preempt_enable(); - } -} +void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr); #define ARCH_HAS_FLUSH_ON_KUNMAP static inline void kunmap_flush_on_unmap(void *addr) diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 0561568f7b48..6faaaa3ebe9b 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -26,12 +26,14 @@ #define copy_page(to, from) copy_page_asm((void *)(to), (void *)(from)) struct page; +struct vm_area_struct; void clear_page_asm(void *page); void copy_page_asm(void *to, void *from); #define clear_user_page(vto, vaddr, page) clear_page_asm(vto) -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg); +void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, + struct vm_area_struct *vma); +#define __HAVE_ARCH_COPY_USER_HIGHPAGE /* * These are used to make use of C type-checking.. diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e7911225a4f8..0fd04073d4b6 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -27,6 +27,7 @@ #include <asm/processor.h> #include <asm/sections.h> #include <asm/shmparam.h> +#include <asm/mmu_context.h> int split_tlb __ro_after_init; int dcache_stride __ro_after_init; @@ -91,7 +92,7 @@ static inline void flush_data_cache(void) } -/* Virtual address of pfn. */ +/* Kernel virtual address of pfn. */ #define pfn_va(pfn) __va(PFN_PHYS(pfn)) void @@ -124,11 +125,13 @@ show_cache_info(struct seq_file *m) cache_info.ic_size/1024 ); if (cache_info.dc_loop != 1) snprintf(buf, 32, "%lu-way associative", cache_info.dc_loop); - seq_printf(m, "D-cache\t\t: %ld KB (%s%s, %s)\n", + seq_printf(m, "D-cache\t\t: %ld KB (%s%s, %s, alias=%d)\n", cache_info.dc_size/1024, (cache_info.dc_conf.cc_wt ? "WT":"WB"), (cache_info.dc_conf.cc_sh ? ", shared I/D":""), - ((cache_info.dc_loop == 1) ? "direct mapped" : buf)); + ((cache_info.dc_loop == 1) ? "direct mapped" : buf), + cache_info.dc_conf.cc_alias + ); seq_printf(m, "ITLB entries\t: %ld\n" "DTLB entries\t: %ld%s\n", cache_info.it_size, cache_info.dt_size, @@ -324,25 +327,81 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, preempt_enable(); } -static inline void -__purge_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, - unsigned long physaddr) +static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmaddr) { - if (!static_branch_likely(&parisc_has_cache)) - return; + unsigned long flags, space, pgd, prot; +#ifdef CONFIG_TLB_PTLOCK + unsigned long pgd_lock; +#endif + + vmaddr &= PAGE_MASK; + preempt_disable(); - purge_dcache_page_asm(physaddr, vmaddr); + + /* Set context for flush */ + local_irq_save(flags); + prot = mfctl(8); + space = mfsp(SR_USER); + pgd = mfctl(25); +#ifdef CONFIG_TLB_PTLOCK + pgd_lock = mfctl(28); +#endif + switch_mm_irqs_off(NULL, vma->vm_mm, NULL); + local_irq_restore(flags); + + flush_user_dcache_range_asm(vmaddr, vmaddr + PAGE_SIZE); if (vma->vm_flags & VM_EXEC) - flush_icache_page_asm(physaddr, vmaddr); + flush_user_icache_range_asm(vmaddr, vmaddr + PAGE_SIZE); + flush_tlb_page(vma, vmaddr); + + /* Restore previous context */ + local_irq_save(flags); +#ifdef CONFIG_TLB_PTLOCK + mtctl(pgd_lock, 28); +#endif + mtctl(pgd, 25); + mtsp(space, SR_USER); + mtctl(prot, 8); + local_irq_restore(flags); + preempt_enable(); } +static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) +{ + pte_t *ptep = NULL; + pgd_t *pgd = mm->pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (!pgd_none(*pgd)) { + p4d = p4d_offset(pgd, addr); + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) + ptep = pte_offset_map(pmd, addr); + } + } + } + return ptep; +} + +static inline bool pte_needs_flush(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NO_CACHE)) + == (_PAGE_PRESENT | _PAGE_ACCESSED); +} + void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping_file(page); struct vm_area_struct *mpnt; unsigned long offset; unsigned long addr, old_addr = 0; + unsigned long count = 0; pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { @@ -357,33 +416,52 @@ void flush_dcache_page(struct page *page) pgoff = page->index; - /* We have carefully arranged in arch_get_unmapped_area() that + /* + * We have carefully arranged in arch_get_unmapped_area() that * *any* mappings of a file are always congruently mapped (whether * declared as MAP_PRIVATE or MAP_SHARED), so we only need - * to flush one address here for them all to become coherent */ - + * to flush one address here for them all to become coherent + * on machines that support equivalent aliasing + */ flush_dcache_mmap_lock(mapping); vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; + if (parisc_requires_coherency()) { + pte_t *ptep; - /* The TLB is the engine of coherence on parisc: The - * CPU is entitled to speculate any page with a TLB - * mapping, so here we kill the mapping then flush the - * page along a special flush only alias mapping. - * This guarantees that the page is no-longer in the - * cache for any process and nor may it be - * speculatively read in (until the user or kernel - * specifically accesses it, of course) */ - - flush_tlb_page(mpnt, addr); - if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) - != (addr & (SHM_COLOUR - 1))) { - __flush_cache_page(mpnt, addr, page_to_phys(page)); - if (parisc_requires_coherency() && old_addr) - printk(KERN_ERR "INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", old_addr, addr, mpnt->vm_file); - old_addr = addr; + ptep = get_ptep(mpnt->vm_mm, addr); + if (ptep && pte_needs_flush(*ptep)) + flush_user_cache_page(mpnt, addr); + } else { + /* + * The TLB is the engine of coherence on parisc: + * The CPU is entitled to speculate any page + * with a TLB mapping, so here we kill the + * mapping then flush the page along a special + * flush only alias mapping. This guarantees that + * the page is no-longer in the cache for any + * process and nor may it be speculatively read + * in (until the user or kernel specifically + * accesses it, of course) + */ + flush_tlb_page(mpnt, addr); + if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) + != (addr & (SHM_COLOUR - 1))) { + __flush_cache_page(mpnt, addr, page_to_phys(page)); + /* + * Software is allowed to have any number + * of private mappings to a page. + */ + if (!(mpnt->vm_flags & VM_SHARED)) + continue; + if (old_addr) + pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", + old_addr, addr, mpnt->vm_file); + old_addr = addr; + } } + WARN_ON(++count == 4096); } flush_dcache_mmap_unlock(mapping); } @@ -403,7 +481,7 @@ void __init parisc_setup_cache_timing(void) { unsigned long rangetime, alltime; unsigned long size; - unsigned long threshold; + unsigned long threshold, threshold2; alltime = mfctl(16); flush_data_cache(); @@ -417,11 +495,16 @@ void __init parisc_setup_cache_timing(void) printk(KERN_DEBUG "Whole cache flush %lu cycles, flushing %lu bytes %lu cycles\n", alltime, size, rangetime); - threshold = L1_CACHE_ALIGN(size * alltime / rangetime); - if (threshold > cache_info.dc_size) - threshold = cache_info.dc_size; - if (threshold) - parisc_cache_flush_threshold = threshold; + threshold = L1_CACHE_ALIGN((unsigned long)((uint64_t)size * alltime / rangetime)); + pr_info("Calculated flush threshold is %lu KiB\n", + threshold/1024); + + /* + * The threshold computed above isn't very reliable. The following + * heuristic works reasonably well on c8000/rp3440. + */ + threshold2 = cache_info.dc_size * num_online_cpus(); + parisc_cache_flush_threshold = threshold2; printk(KERN_INFO "Cache flush threshold set to %lu KiB\n", parisc_cache_flush_threshold/1024); @@ -477,19 +560,47 @@ void flush_kernel_dcache_page_addr(void *addr) } EXPORT_SYMBOL(flush_kernel_dcache_page_addr); -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg) +static void flush_cache_page_if_present(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { - /* Copy using kernel mapping. No coherency is needed (all in - kunmap) for the `to' page. However, the `from' page needs to - be flushed through a mapping equivalent to the user mapping - before it can be accessed through the kernel mapping. */ - preempt_disable(); - flush_dcache_page_asm(__pa(vfrom), vaddr); - copy_page_asm(vto, vfrom); - preempt_enable(); + pte_t *ptep = get_ptep(vma->vm_mm, vmaddr); + + /* + * The pte check is racy and sometimes the flush will trigger + * a non-access TLB miss. Hopefully, the page has already been + * flushed. + */ + if (ptep && pte_needs_flush(*ptep)) + flush_cache_page(vma, vmaddr, pfn); +} + +void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + void *kto, *kfrom; + + kfrom = kmap_local_page(from); + kto = kmap_local_page(to); + flush_cache_page_if_present(vma, vaddr, page_to_pfn(from)); + copy_page_asm(kto, kfrom); + kunmap_local(kto); + kunmap_local(kfrom); +} + +void copy_to_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len) +{ + flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + memcpy(dst, src, len); + flush_kernel_dcache_range_asm((unsigned long)dst, (unsigned long)dst + len); +} + +void copy_from_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long user_vaddr, void *dst, void *src, int len) +{ + flush_cache_page_if_present(vma, user_vaddr, page_to_pfn(page)); + memcpy(dst, src, len); } -EXPORT_SYMBOL(copy_user_page); /* __flush_tlb_range() * @@ -520,92 +631,105 @@ int __flush_tlb_range(unsigned long sid, unsigned long start, return 0; } -static inline unsigned long mm_total_size(struct mm_struct *mm) +static void flush_cache_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct vm_area_struct *vma; - unsigned long usize = 0; - - for (vma = mm->mmap; vma; vma = vma->vm_next) - usize += vma->vm_end - vma->vm_start; - return usize; -} - -static inline pte_t *get_ptep(pgd_t *pgd, unsigned long addr) -{ - pte_t *ptep = NULL; + unsigned long addr, pfn; + pte_t *ptep; - if (!pgd_none(*pgd)) { - p4d_t *p4d = p4d_offset(pgd, addr); - if (!p4d_none(*p4d)) { - pud_t *pud = pud_offset(p4d, addr); - if (!pud_none(*pud)) { - pmd_t *pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) - ptep = pte_offset_map(pmd, addr); + for (addr = start; addr < end; addr += PAGE_SIZE) { + /* + * The vma can contain pages that aren't present. Although + * the pte search is expensive, we need the pte to find the + * page pfn and to check whether the page should be flushed. + */ + ptep = get_ptep(vma->vm_mm, addr); + if (ptep && pte_needs_flush(*ptep)) { + if (parisc_requires_coherency()) { + flush_user_cache_page(vma, addr); + } else { + pfn = pte_pfn(*ptep); + if (WARN_ON(!pfn_valid(pfn))) + return; + __flush_cache_page(vma, addr, PFN_PHYS(pfn)); } } } - return ptep; } -static void flush_cache_pages(struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline unsigned long mm_total_size(struct mm_struct *mm) { - unsigned long addr, pfn; - pte_t *ptep; + struct vm_area_struct *vma; + unsigned long usize = 0; - for (addr = start; addr < end; addr += PAGE_SIZE) { - ptep = get_ptep(mm->pgd, addr); - if (ptep) { - pfn = pte_pfn(*ptep); - flush_cache_page(vma, addr, pfn); - } - } + for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next) + usize += vma->vm_end - vma->vm_start; + return usize; } void flush_cache_mm(struct mm_struct *mm) { struct vm_area_struct *vma; - /* Flushing the whole cache on each cpu takes forever on - rp3440, etc. So, avoid it if the mm isn't too big. */ - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - mm_total_size(mm) >= parisc_cache_flush_threshold) { - if (mm->context.space_id) - flush_tlb_all(); + /* + * Flushing the whole cache on each cpu takes forever on + * rp3440, etc. So, avoid it if the mm isn't too big. + * + * Note that we must flush the entire cache on machines + * with aliasing caches to prevent random segmentation + * faults. + */ + if (!parisc_requires_coherency() + || mm_total_size(mm) >= parisc_cache_flush_threshold) { + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) + return; + flush_tlb_all(); flush_cache_all(); return; } + /* Flush mm */ for (vma = mm->mmap; vma; vma = vma->vm_next) - flush_cache_pages(vma, mm, vma->vm_start, vma->vm_end); + flush_cache_pages(vma, vma->vm_start, vma->vm_end); } -void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && - end - start >= parisc_cache_flush_threshold) { - if (vma->vm_mm->context.space_id) - flush_tlb_range(vma, start, end); + if (!parisc_requires_coherency() + || end - start >= parisc_cache_flush_threshold) { + if (WARN_ON(IS_ENABLED(CONFIG_SMP) && arch_irqs_disabled())) + return; + flush_tlb_range(vma, start, end); flush_cache_all(); return; } - flush_cache_pages(vma, vma->vm_mm, start, end); + flush_cache_pages(vma, start, end); } -void -flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) +void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { - if (pfn_valid(pfn)) { - if (likely(vma->vm_mm->context.space_id)) { - flush_tlb_page(vma, vmaddr); - __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); - } else { - __purge_cache_page(vma, vmaddr, PFN_PHYS(pfn)); - } + if (WARN_ON(!pfn_valid(pfn))) + return; + if (parisc_requires_coherency()) + flush_user_cache_page(vma, vmaddr); + else + __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); +} + +void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) +{ + if (!PageAnon(page)) + return; + + if (parisc_requires_coherency()) { + flush_user_cache_page(vma, vmaddr); + return; } + + flush_tlb_page(vma, vmaddr); + preempt_disable(); + flush_dcache_page_asm(page_to_phys(page), vmaddr); + preempt_enable(); } void flush_kernel_vmap_range(void *vaddr, int size) diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index 80a0ab372802..e59574f65e64 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -40,10 +40,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags, *need_unmap = 1; set_fixmap(fixmap, page_to_phys(page)); - if (flags) - raw_spin_lock_irqsave(&patch_lock, *flags); - else - __acquire(&patch_lock); + raw_spin_lock_irqsave(&patch_lock, *flags); return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } @@ -52,10 +49,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { clear_fixmap(fixmap); - if (flags) - raw_spin_unlock_irqrestore(&patch_lock, *flags); - else - __release(&patch_lock); + raw_spin_unlock_irqrestore(&patch_lock, *flags); } void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) @@ -67,8 +61,9 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) int mapped; /* Make sure we don't have any aliases in cache */ - flush_kernel_vmap_range(addr, len); - flush_icache_range(start, end); + flush_kernel_dcache_range_asm(start, end); + flush_kernel_icache_range_asm(start, end); + flush_tlb_kernel_range(start, end); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped); @@ -81,8 +76,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) * We're crossing a page boundary, so * need to remap */ - flush_kernel_vmap_range((void *)fixmap, - (p-fixmap) * sizeof(*p)); + flush_kernel_dcache_range_asm((unsigned long)fixmap, + (unsigned long)p); + flush_tlb_kernel_range((unsigned long)fixmap, + (unsigned long)p); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, @@ -90,10 +87,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) } } - flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p)); + flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p); + flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); - flush_icache_range(start, end); } void __kprobes __patch_text(void *addr, u32 insn) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f114e102aaf2..84bc437be5cd 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -22,6 +22,8 @@ #include <asm/traps.h> +#define DEBUG_NATLB 0 + /* Various important other fields */ #define bit22set(x) (x & 0x00000200) #define bits23_25set(x) (x & 0x000001c0) @@ -450,8 +452,8 @@ handle_nadtlb_fault(struct pt_regs *regs) fallthrough; case 0x380: /* PDC and FIC instructions */ - if (printk_ratelimit()) { - pr_warn("BUG: nullifying cache flush/purge instruction\n"); + if (DEBUG_NATLB && printk_ratelimit()) { + pr_warn("WARNING: nullifying cache flush/purge instruction\n"); show_regs(regs); } if (insn & 0x20) { diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi index 746c4d4e7686..cf2f55e1dcb6 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -366,7 +366,7 @@ gpio1: gpio@20121000 { compatible = "microchip,mpfs-gpio"; - reg = <000 0x20121000 0x0 0x1000>; + reg = <0x0 0x20121000 0x0 0x1000>; interrupt-parent = <&plic>; interrupt-controller; #interrupt-cells = <1>; diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index aad45d7f498f..5c638fd5b35c 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -167,7 +167,7 @@ clocks = <&prci FU540_PRCI_CLK_TLCLK>; status = "disabled"; }; - dma: dma@3000000 { + dma: dma-controller@3000000 { compatible = "sifive,fu540-c000-pdma"; reg = <0x0 0x3000000 0x0 0x8000>; interrupt-parent = <&plic0>; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6dbd7e9f74c9..0352e4589efa 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -163,7 +163,11 @@ extra_header_fields: .long 0x200 # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) +#ifdef CONFIG_DXE_MEM_ATTRIBUTES + .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics +#else .word 0 # DllCharacteristics +#endif #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 98938a68251c..bed74a0f2932 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#define efi_dxe_call(func, ...) \ + (efi_is_native() \ + ? efi_dxe_table->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) + #else /* CONFIG_EFI_MIXED */ static inline bool efi_is_64bit(void) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 46f9dfb60469..a0702b6be3e8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1914,7 +1914,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - unsigned long valid_bank_mask; + u64 valid_bank_mask; u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; u32 vector; bool all_cpus; @@ -1956,7 +1956,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; - if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64)) + if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 311e4e1d7870..45e1573f8f1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5470,14 +5470,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) uint i; if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->root.hpa); tlb_flush = true; } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } } @@ -5665,6 +5667,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + bool unstable; restart: list_for_each_entry_safe_reverse(sp, node, @@ -5696,11 +5699,12 @@ restart: goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { - batch += nr_zapped; + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped); + batch += nr_zapped; + + if (unstable) goto restart; - } } /* diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eca39f56c231..0604bc29f0b8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -171,9 +171,12 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *a, const void *b) +static int cmp_u64(const void *pa, const void *pb) { - return *(__u64 *)a - *(__u64 *)b; + u64 a = *(u64 *)pa; + u64 b = *(u64 *)pb; + + return (a > b) - (a < b); } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 147c30a81f15..1591d67e0bcd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = { #ifdef CONFIG_LOAD_UEFI_KEYS &efi.mokvar_table, #endif +#ifdef CONFIG_EFI_COCO_SECRET + &efi.coco_secret, +#endif }; u64 efi_setup; /* efi setup_data physical address */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 84d749511f55..ed1869a305c4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1169,6 +1169,62 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) complete(waiting); } +/* + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + +static void __blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *done, bool use_plug) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io = done; + + blk_account_io_start(rq); + + if (use_plug && current->plug) { + blk_add_rq_to_plug(current->plug, rq); + return; + } + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} + + /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert @@ -1184,18 +1240,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) */ void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) { - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); + __blk_execute_rq_nowait(rq, at_head, done, true); - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -1233,8 +1279,13 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; + /* + * iopoll requires request to be submitted to driver, so can't + * use plug + */ rq->end_io_data = &wait; - blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); + __blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq, + !blk_rq_is_poll(rq)); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; @@ -2676,40 +2727,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -/* - * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple - * queues. This is important for md arrays to benefit from merging - * requests. - */ -static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) -{ - if (plug->multiple_queues) - return BLK_MAX_REQUEST_COUNT * 2; - return BLK_MAX_REQUEST_COUNT; -} - -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - struct request *last = rq_list_peek(&plug->mq_list); - - if (!plug->rq_count) { - trace_block_plug(rq->q); - } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || - (!blk_queue_nomerges(rq->q) && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(rq->q); - } - - if (!plug->multiple_queues && last && last->q != rq->q) - plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3ed5eaf3446a..6ed602b2f80a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -742,6 +742,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, if (at_head) { list_add(&rq->queuelist, &per_prio->dispatch); + rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c index 23cc8297ec4c..d429ba52a719 100644 --- a/drivers/clk/at91/clk-generated.c +++ b/drivers/clk/at91/clk-generated.c @@ -117,6 +117,10 @@ static void clk_generated_best_diff(struct clk_rate_request *req, tmp_rate = parent_rate; else tmp_rate = parent_rate / div; + + if (tmp_rate < req->min_rate || tmp_rate > req->max_rate) + return; + tmp_diff = abs(req->rate - tmp_rate); if (*best_diff < 0 || *best_diff >= tmp_diff) { diff --git a/drivers/clk/bcm/clk-bcm2835.c b/drivers/clk/bcm/clk-bcm2835.c index 3ad20e75fd23..48a1eb9f2d55 100644 --- a/drivers/clk/bcm/clk-bcm2835.c +++ b/drivers/clk/bcm/clk-bcm2835.c @@ -941,6 +941,7 @@ static u32 bcm2835_clock_choose_div(struct clk_hw *hw, u64 temp = (u64)parent_rate << CM_DIV_FRAC_BITS; u32 div, mindiv, maxdiv; + do_div(temp, rate); div = temp; div &= ~unused_frac_mask; diff --git a/drivers/clk/sunxi-ng/ccu-sun6i-rtc.c b/drivers/clk/sunxi-ng/ccu-sun6i-rtc.c index 2f3ddc908ebd..d65398497d5f 100644 --- a/drivers/clk/sunxi-ng/ccu-sun6i-rtc.c +++ b/drivers/clk/sunxi-ng/ccu-sun6i-rtc.c @@ -298,10 +298,6 @@ static const struct sunxi_ccu_desc sun6i_rtc_ccu_desc = { .hw_clks = &sun6i_rtc_ccu_hw_clks, }; -static const struct clk_parent_data sun50i_h6_osc32k_fanout_parents[] = { - { .hw = &osc32k_clk.common.hw }, -}; - static const struct clk_parent_data sun50i_h616_osc32k_fanout_parents[] = { { .hw = &osc32k_clk.common.hw }, { .fw_name = "pll-32k" }, @@ -314,13 +310,6 @@ static const struct clk_parent_data sun50i_r329_osc32k_fanout_parents[] = { { .hw = &osc24M_32k_clk.common.hw } }; -static const struct sun6i_rtc_match_data sun50i_h6_rtc_ccu_data = { - .have_ext_osc32k = true, - .have_iosc_calibration = true, - .osc32k_fanout_parents = sun50i_h6_osc32k_fanout_parents, - .osc32k_fanout_nparents = ARRAY_SIZE(sun50i_h6_osc32k_fanout_parents), -}; - static const struct sun6i_rtc_match_data sun50i_h616_rtc_ccu_data = { .have_iosc_calibration = true, .rtc_32k_single_parent = true, @@ -336,10 +325,6 @@ static const struct sun6i_rtc_match_data sun50i_r329_rtc_ccu_data = { static const struct of_device_id sun6i_rtc_ccu_match[] = { { - .compatible = "allwinner,sun50i-h6-rtc", - .data = &sun50i_h6_rtc_ccu_data, - }, - { .compatible = "allwinner,sun50i-h616-rtc", .data = &sun50i_h616_rtc_ccu_data, }, diff --git a/drivers/crypto/qcom-rng.c b/drivers/crypto/qcom-rng.c index 11f30fd48c14..031b5f701a0a 100644 --- a/drivers/crypto/qcom-rng.c +++ b/drivers/crypto/qcom-rng.c @@ -65,6 +65,7 @@ static int qcom_rng_read(struct qcom_rng *rng, u8 *data, unsigned int max) } else { /* copy only remaining bytes */ memcpy(data, &val, max - currsize); + break; } } while (currsize < max); diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index b1e25ae98302..53297a0d9c57 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -407,6 +407,7 @@ static inline int is_dma_buf_file(struct file *file) static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) { + static atomic64_t dmabuf_inode = ATOMIC64_INIT(0); struct file *file; struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb); @@ -416,6 +417,13 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags) inode->i_size = dmabuf->size; inode_set_bytes(inode, dmabuf->size); + /* + * The ->i_ino acquired from get_next_ino() is not unique thus + * not suitable for using it as dentry name by dmabuf stats. + * Override ->i_ino with the unique and dmabuffs specific + * value. + */ + inode->i_ino = atomic64_add_return(1, &dmabuf_inode); file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf", flags, &dma_buf_fops); if (IS_ERR(file)) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 2c3dac5ecb36..4720ba98cec3 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -91,6 +91,18 @@ config EFI_SOFT_RESERVE If unsure, say Y. +config EFI_DXE_MEM_ATTRIBUTES + bool "Adjust memory attributes in EFISTUB" + depends on EFI && EFI_STUB && X86 + default y + help + UEFI specification does not guarantee all memory to be + accessible for both write and execute as the kernel expects + it to be. + Use DXE services to check and alter memory protection + attributes during boot via EFISTUB to ensure that memory + ranges used by the kernel are writable and executable. + config EFI_PARAMS_FROM_FDT bool help @@ -284,3 +296,34 @@ config EFI_CUSTOM_SSDT_OVERLAYS See Documentation/admin-guide/acpi/ssdt-overlays.rst for more information. + +config EFI_DISABLE_RUNTIME + bool "Disable EFI runtime services support by default" + default y if PREEMPT_RT + help + Allow to disable the EFI runtime services support by default. This can + already be achieved by using the efi=noruntime option, but it could be + useful to have this default without any kernel command line parameter. + + The EFI runtime services are disabled by default when PREEMPT_RT is + enabled, because measurements have shown that some EFI functions calls + might take too much time to complete, causing large latencies which is + an issue for Real-Time kernels. + + This default can be overridden by using the efi=runtime option. + +config EFI_COCO_SECRET + bool "EFI Confidential Computing Secret Area Support" + depends on EFI + help + Confidential Computing platforms (such as AMD SEV) allow the + Guest Owner to securely inject secrets during guest VM launch. + The secrets are placed in a designated EFI reserved memory area. + + In order to use the secrets in the kernel, the location of the secret + area (as published in the EFI config table) must be kept. + + If you say Y here, the address of the EFI secret area will be kept + for usage inside the kernel. This will allow the + virt/coco/efi_secret module to access the secrets, which in turn + allows userspace programs to access the injected secrets. diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 5502e176d51b..860534bcfdac 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -46,6 +46,9 @@ struct efi __read_mostly efi = { #ifdef CONFIG_LOAD_UEFI_KEYS .mokvar_table = EFI_INVALID_TABLE_ADDR, #endif +#ifdef CONFIG_EFI_COCO_SECRET + .coco_secret = EFI_INVALID_TABLE_ADDR, +#endif }; EXPORT_SYMBOL(efi); @@ -66,7 +69,7 @@ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; -static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); +static bool disable_runtime = IS_ENABLED(CONFIG_EFI_DISABLE_RUNTIME); static int __init setup_noefi(char *arg) { disable_runtime = true; @@ -422,6 +425,11 @@ static int __init efisubsys_init(void) if (efi_enabled(EFI_DBG) && efi_enabled(EFI_PRESERVE_BS_REGIONS)) efi_debugfs_init(); +#ifdef CONFIG_EFI_COCO_SECRET + if (efi.coco_secret != EFI_INVALID_TABLE_ADDR) + platform_device_register_simple("efi_secret", 0, NULL, 0); +#endif + return 0; err_remove_group: @@ -529,6 +537,9 @@ static const efi_config_table_type_t common_tables[] __initconst = { #ifdef CONFIG_LOAD_UEFI_KEYS {LINUX_EFI_MOK_VARIABLE_TABLE_GUID, &efi.mokvar_table, "MOKvar" }, #endif +#ifdef CONFIG_EFI_COCO_SECRET + {LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" }, +#endif {}, }; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 4b5b2403b3a0..0131e3aaa605 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -117,7 +117,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { const int slack = TEXT_OFFSET - 5 * PAGE_SIZE; int alloc_size = MAX_UNCOMP_KERNEL_SIZE + EFI_PHYS_ALIGN; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 9cc556013d08..577173ee1f83 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -83,7 +83,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; @@ -100,7 +101,15 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, u64 min_kimg_align = efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - if (!efi_nokaslr) { + efi_guid_t li_fixed_proto = LINUX_EFI_LOADED_IMAGE_FIXED_GUID; + void *p; + + if (efi_nokaslr) { + efi_info("KASLR disabled on kernel command line\n"); + } else if (efi_bs_call(handle_protocol, image_handle, + &li_fixed_proto, &p) == EFI_SUCCESS) { + efi_info("Image placement fixed by loader\n"); + } else { status = efi_get_random_bytes(sizeof(phys_seed), (u8 *)&phys_seed); if (status == EFI_NOT_FOUND) { @@ -111,8 +120,6 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, status); efi_nokaslr = true; } - } else { - efi_info("KASLR disabled on kernel command line\n"); } } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index da93864d7abc..f515394cce6e 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -198,7 +198,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, status = handle_kernel_image(&image_addr, &image_size, &reserve_addr, &reserve_size, - image); + image, handle); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); goto fail_free_screeninfo; diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index edb77b0621ea..b0ae0a454404 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -36,6 +36,9 @@ extern bool efi_novamap; extern const efi_system_table_t *efi_system_table; +typedef union efi_dxe_services_table efi_dxe_services_table_t; +extern const efi_dxe_services_table_t *efi_dxe_table; + efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg); @@ -44,6 +47,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, #define efi_is_native() (true) #define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) #define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) +#define efi_dxe_call(func, ...) efi_dxe_table->func(__VA_ARGS__) #define efi_table_attr(inst, attr) (inst->attr) #define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) @@ -329,6 +333,76 @@ union efi_boot_services { } mixed_mode; }; +typedef enum { + EfiGcdMemoryTypeNonExistent, + EfiGcdMemoryTypeReserved, + EfiGcdMemoryTypeSystemMemory, + EfiGcdMemoryTypeMemoryMappedIo, + EfiGcdMemoryTypePersistent, + EfiGcdMemoryTypeMoreReliable, + EfiGcdMemoryTypeMaximum +} efi_gcd_memory_type_t; + +typedef struct { + efi_physical_addr_t base_address; + u64 length; + u64 capabilities; + u64 attributes; + efi_gcd_memory_type_t gcd_memory_type; + void *image_handle; + void *device_handle; +} efi_gcd_memory_space_desc_t; + +/* + * EFI DXE Services table + */ +union efi_dxe_services_table { + struct { + efi_table_hdr_t hdr; + void *add_memory_space; + void *allocate_memory_space; + void *free_memory_space; + void *remove_memory_space; + efi_status_t (__efiapi *get_memory_space_descriptor)(efi_physical_addr_t, + efi_gcd_memory_space_desc_t *); + efi_status_t (__efiapi *set_memory_space_attributes)(efi_physical_addr_t, + u64, u64); + void *get_memory_space_map; + void *add_io_space; + void *allocate_io_space; + void *free_io_space; + void *remove_io_space; + void *get_io_space_descriptor; + void *get_io_space_map; + void *dispatch; + void *schedule; + void *trust; + void *process_firmware_volume; + void *set_memory_space_capabilities; + }; + struct { + efi_table_hdr_t hdr; + u32 add_memory_space; + u32 allocate_memory_space; + u32 free_memory_space; + u32 remove_memory_space; + u32 get_memory_space_descriptor; + u32 set_memory_space_attributes; + u32 get_memory_space_map; + u32 add_io_space; + u32 allocate_io_space; + u32 free_io_space; + u32 remove_io_space; + u32 get_io_space_descriptor; + u32 get_io_space_map; + u32 dispatch; + u32 schedule; + u32 trust; + u32 process_firmware_volume; + u32 set_memory_space_capabilities; + } mixed_mode; +}; + typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t; union efi_uga_draw_protocol { @@ -720,6 +794,13 @@ union efi_tcg2_protocol { } mixed_mode; }; +struct riscv_efi_boot_protocol { + u64 revision; + + efi_status_t (__efiapi *get_boot_hartid)(struct riscv_efi_boot_protocol *, + unsigned long *boot_hartid); +}; + typedef union efi_load_file_protocol efi_load_file_protocol_t; typedef union efi_load_file_protocol efi_load_file2_protocol_t; @@ -865,7 +946,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image); + efi_loaded_image_t *image, + efi_handle_t image_handle); asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, unsigned long fdt_addr, diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 724155b9e10d..715f37479154 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -56,6 +56,7 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long random_seed) { unsigned long map_size, desc_size, total_slots = 0, target_slot; + unsigned long total_mirrored_slots = 0; unsigned long buff_size; efi_status_t status; efi_memory_desc_t *memory_map; @@ -86,8 +87,14 @@ efi_status_t efi_random_alloc(unsigned long size, slots = get_entry_num_slots(md, size, ilog2(align)); MD_NUM_SLOTS(md) = slots; total_slots += slots; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) + total_mirrored_slots += slots; } + /* consider only mirrored slots for randomization if any exist */ + if (total_mirrored_slots > 0) + total_slots = total_mirrored_slots; + /* find a random number between 0 and total_slots */ target_slot = (total_slots * (u64)(random_seed & U32_MAX)) >> 32; @@ -107,6 +114,10 @@ efi_status_t efi_random_alloc(unsigned long size, efi_physical_addr_t target; unsigned long pages; + if (total_mirrored_slots > 0 && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + if (target_slot >= MD_NUM_SLOTS(md)) { target_slot -= MD_NUM_SLOTS(md); continue; diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index 9c460843442f..9e85e58d1f27 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -21,9 +21,9 @@ #define MIN_KIMG_ALIGN SZ_4M #endif -typedef void __noreturn (*jump_kernel_func)(unsigned int, unsigned long); +typedef void __noreturn (*jump_kernel_func)(unsigned long, unsigned long); -static u32 hartid; +static unsigned long hartid; static int get_boot_hartid_from_fdt(void) { @@ -47,14 +47,31 @@ static int get_boot_hartid_from_fdt(void) return 0; } +static efi_status_t get_boot_hartid_from_efi(void) +{ + efi_guid_t boot_protocol_guid = RISCV_EFI_BOOT_PROTOCOL_GUID; + struct riscv_efi_boot_protocol *boot_protocol; + efi_status_t status; + + status = efi_bs_call(locate_protocol, &boot_protocol_guid, NULL, + (void **)&boot_protocol); + if (status != EFI_SUCCESS) + return status; + return efi_call_proto(boot_protocol, get_boot_hartid, &hartid); +} + efi_status_t check_platform_features(void) { + efi_status_t status; int ret; - ret = get_boot_hartid_from_fdt(); - if (ret) { - efi_err("/chosen/boot-hartid missing or invalid!\n"); - return EFI_UNSUPPORTED; + status = get_boot_hartid_from_efi(); + if (status != EFI_SUCCESS) { + ret = get_boot_hartid_from_fdt(); + if (ret) { + efi_err("Failed to get boot hartid!\n"); + return EFI_UNSUPPORTED; + } } return EFI_SUCCESS; } @@ -80,7 +97,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, unsigned long *reserve_size, - efi_loaded_image_t *image) + efi_loaded_image_t *image, + efi_handle_t image_handle) { unsigned long kernel_size = 0; unsigned long preferred_addr; diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 01ddd4502e28..b14e88ccefca 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -22,6 +22,7 @@ #define MAXMEM_X86_64_4LEVEL (1ull << 46) const efi_system_table_t *efi_system_table; +const efi_dxe_services_table_t *efi_dxe_table; extern u32 image_offset; static efi_loaded_image_t *image = NULL; @@ -211,9 +212,110 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } +static void +adjust_memory_range_protection(unsigned long start, unsigned long size) +{ + efi_status_t status; + efi_gcd_memory_space_desc_t desc; + unsigned long end, next; + unsigned long rounded_start, rounded_end; + unsigned long unprotect_start, unprotect_size; + int has_system_memory = 0; + + if (efi_dxe_table == NULL) + return; + + rounded_start = rounddown(start, EFI_PAGE_SIZE); + rounded_end = roundup(start + size, EFI_PAGE_SIZE); + + /* + * Don't modify memory region attributes, they are + * already suitable, to lower the possibility to + * encounter firmware bugs. + */ + + for (end = start + size; start < end; start = next) { + + status = efi_dxe_call(get_memory_space_descriptor, start, &desc); + + if (status != EFI_SUCCESS) + return; + + next = desc.base_address + desc.length; + + /* + * Only system memory is suitable for trampoline/kernel image placement, + * so only this type of memory needs its attributes to be modified. + */ + + if (desc.gcd_memory_type != EfiGcdMemoryTypeSystemMemory || + (desc.attributes & (EFI_MEMORY_RO | EFI_MEMORY_XP)) == 0) + continue; + + unprotect_start = max(rounded_start, (unsigned long)desc.base_address); + unprotect_size = min(rounded_end, next) - unprotect_start; + + status = efi_dxe_call(set_memory_space_attributes, + unprotect_start, unprotect_size, + EFI_MEMORY_WB); + + if (status != EFI_SUCCESS) { + efi_warn("Unable to unprotect memory range [%08lx,%08lx]: %d\n", + unprotect_start, + unprotect_start + unprotect_size, + (int)status); + } + } +} + +/* + * Trampoline takes 2 pages and can be loaded in first megabyte of memory + * with its end placed between 128k and 640k where BIOS might start. + * (see arch/x86/boot/compressed/pgtable_64.c) + * + * We cannot find exact trampoline placement since memory map + * can be modified by UEFI, and it can alter the computed address. + */ + +#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024) +#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024) + +void startup_32(struct boot_params *boot_params); + +static void +setup_memory_protection(unsigned long image_base, unsigned long image_size) +{ + /* + * Allow execution of possible trampoline used + * for switching between 4- and 5-level page tables + * and relocated kernel image. + */ + + adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE, + TRAMPOLINE_PLACEMENT_SIZE); + +#ifdef CONFIG_64BIT + if (image_base != (unsigned long)startup_32) + adjust_memory_range_protection(image_base, image_size); +#else + /* + * Clear protection flags on a whole range of possible + * addresses used for KASLR. We don't need to do that + * on x86_64, since KASLR/extraction is performed after + * dedicated identity page tables are built and we only + * need to remove possible protection on relocated image + * itself disregarding further relocations. + */ + adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, + KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); +#endif +} + static const efi_char16_t apple[] = L"Apple"; -static void setup_quirks(struct boot_params *boot_params) +static void setup_quirks(struct boot_params *boot_params, + unsigned long image_base, + unsigned long image_size) { efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) efi_table_attr(efi_system_table, fw_vendor); @@ -222,6 +324,9 @@ static void setup_quirks(struct boot_params *boot_params) if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) retrieve_apple_device_properties(boot_params); } + + if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) + setup_memory_protection(image_base, image_size); } /* @@ -341,8 +446,6 @@ static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status) asm("hlt"); } -void startup_32(struct boot_params *boot_params); - void __noreturn efi_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg, struct boot_params *boot_params); @@ -677,11 +780,17 @@ unsigned long efi_main(efi_handle_t handle, efi_status_t status; efi_system_table = sys_table_arg; - /* Check if we were booted by the EFI firmware */ if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); + efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); + if (efi_dxe_table && + efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) { + efi_warn("Ignoring DXE services table: invalid signature\n"); + efi_dxe_table = NULL; + } + /* * If the kernel isn't already loaded at a suitable address, * relocate it. @@ -791,7 +900,7 @@ unsigned long efi_main(efi_handle_t handle, setup_efi_pci(boot_params); - setup_quirks(boot_params); + setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index a2c8dd329b31..2db19cd640a4 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -707,6 +707,9 @@ static int mvebu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, unsigned long flags; unsigned int on, off; + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + val = (unsigned long long) mvpwm->clk_rate * state->duty_cycle; do_div(val, NSEC_PER_SEC); if (val > UINT_MAX + 1ULL) diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index 20780c35da1b..23cddb265a0d 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -125,9 +125,13 @@ static int vf610_gpio_direction_output(struct gpio_chip *chip, unsigned gpio, { struct vf610_gpio_port *port = gpiochip_get_data(chip); unsigned long mask = BIT(gpio); + u32 val; - if (port->sdata && port->sdata->have_paddr) - vf610_gpio_writel(mask, port->gpio_base + GPIO_PDDR); + if (port->sdata && port->sdata->have_paddr) { + val = vf610_gpio_readl(port->gpio_base + GPIO_PDDR); + val |= mask; + vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR); + } vf610_gpio_set(chip, gpio, value); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cdf0818088b3..7606e3b6361e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1342,9 +1342,11 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); +bool amdgpu_acpi_should_gpu_reset(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } +static inline bool amdgpu_acpi_should_gpu_reset(struct amdgpu_device *adev) { return false; } static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 0e12315fa0cb..98ac53ee6bb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1046,6 +1046,20 @@ bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) } /** + * amdgpu_acpi_should_gpu_reset + * + * @adev: amdgpu_device_pointer + * + * returns true if should reset GPU, false if not + */ +bool amdgpu_acpi_should_gpu_reset(struct amdgpu_device *adev) +{ + if (adev->flags & AMD_IS_APU) + return false; + return pm_suspend_target_state != PM_SUSPEND_TO_IDLE; +} + +/** * amdgpu_acpi_is_s0ix_active * * @adev: amdgpu_device_pointer diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 7fd0277b2805..46ef57b07c15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2336,7 +2336,7 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev) struct drm_device *drm_dev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(drm_dev); - if (!adev->in_s0ix) + if (amdgpu_acpi_should_gpu_reset(adev)) return amdgpu_asic_reset(adev); return 0; diff --git a/drivers/gpu/drm/dp/drm_dp_mst_topology.c b/drivers/gpu/drm/dp/drm_dp_mst_topology.c index 11300b53d24f..7a7cc44686f9 100644 --- a/drivers/gpu/drm/dp/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/dp/drm_dp_mst_topology.c @@ -4852,6 +4852,7 @@ static void fetch_monitor_name(struct drm_dp_mst_topology_mgr *mgr, mst_edid = drm_dp_mst_get_edid(port->connector, mgr, port); drm_edid_get_monitor_name(mst_edid, name, namelen); + kfree(mst_edid); } /** diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c index 7616a3906b9e..1b774dcfb281 100644 --- a/drivers/gpu/drm/i915/display/intel_dmc.c +++ b/drivers/gpu/drm/i915/display/intel_dmc.c @@ -367,6 +367,44 @@ static void dmc_set_fw_offset(struct intel_dmc *dmc, } } +static bool dmc_mmio_addr_sanity_check(struct intel_dmc *dmc, + const u32 *mmioaddr, u32 mmio_count, + int header_ver, u8 dmc_id) +{ + struct drm_i915_private *i915 = container_of(dmc, typeof(*i915), dmc); + u32 start_range, end_range; + int i; + + if (dmc_id >= DMC_FW_MAX) { + drm_warn(&i915->drm, "Unsupported firmware id %u\n", dmc_id); + return false; + } + + if (header_ver == 1) { + start_range = DMC_MMIO_START_RANGE; + end_range = DMC_MMIO_END_RANGE; + } else if (dmc_id == DMC_FW_MAIN) { + start_range = TGL_MAIN_MMIO_START; + end_range = TGL_MAIN_MMIO_END; + } else if (DISPLAY_VER(i915) >= 13) { + start_range = ADLP_PIPE_MMIO_START; + end_range = ADLP_PIPE_MMIO_END; + } else if (DISPLAY_VER(i915) >= 12) { + start_range = TGL_PIPE_MMIO_START(dmc_id); + end_range = TGL_PIPE_MMIO_END(dmc_id); + } else { + drm_warn(&i915->drm, "Unknown mmio range for sanity check"); + return false; + } + + for (i = 0; i < mmio_count; i++) { + if (mmioaddr[i] < start_range || mmioaddr[i] > end_range) + return false; + } + + return true; +} + static u32 parse_dmc_fw_header(struct intel_dmc *dmc, const struct intel_dmc_header_base *dmc_header, size_t rem_size, u8 dmc_id) @@ -436,6 +474,12 @@ static u32 parse_dmc_fw_header(struct intel_dmc *dmc, return 0; } + if (!dmc_mmio_addr_sanity_check(dmc, mmioaddr, mmio_count, + dmc_header->header_ver, dmc_id)) { + drm_err(&i915->drm, "DMC firmware has Wrong MMIO Addresses\n"); + return 0; + } + for (i = 0; i < mmio_count; i++) { dmc_info->mmioaddr[i] = _MMIO(mmioaddr[i]); dmc_info->mmiodata[i] = mmiodata[i]; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index d42f437149c9..6ca8929cf6e1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1252,14 +1252,12 @@ static void *reloc_iomap(struct i915_vma *batch, * Only attempt to pin the batch buffer to ggtt if the current batch * is not inside ggtt, or the batch buffer is not misplaced. */ - if (!i915_is_ggtt(batch->vm)) { + if (!i915_is_ggtt(batch->vm) || + !i915_vma_misplaced(batch, 0, 0, PIN_MAPPABLE)) { vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0, PIN_MAPPABLE | PIN_NONBLOCK /* NOWARN */ | PIN_NOEVICT); - } else if (i915_vma_is_map_and_fenceable(batch)) { - __i915_vma_pin(batch); - vma = batch; } if (vma == ERR_PTR(-EDEADLK)) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 82713264b96c..b7c6d4462ec5 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -806,7 +806,7 @@ static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) __intel_engine_reset(engine, stalled_mask & engine->mask); local_bh_enable(); - intel_uc_reset(>->uc, true); + intel_uc_reset(>->uc, ALL_ENGINES); intel_ggtt_restore_fences(gt->ggtt); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index bf7079480d47..2488d1197f3e 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -438,7 +438,7 @@ int intel_guc_global_policies_update(struct intel_guc *guc); void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq); void intel_guc_submission_reset_prepare(struct intel_guc *guc); -void intel_guc_submission_reset(struct intel_guc *guc, bool stalled); +void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled); void intel_guc_submission_reset_finish(struct intel_guc *guc); void intel_guc_submission_cancel_requests(struct intel_guc *guc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 1ce7e04aa837..28f9aac0201d 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1590,9 +1590,9 @@ __unwind_incomplete_requests(struct intel_context *ce) spin_unlock_irqrestore(&sched_engine->lock, flags); } -static void __guc_reset_context(struct intel_context *ce, bool stalled) +static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t stalled) { - bool local_stalled; + bool guilty; struct i915_request *rq; unsigned long flags; u32 head; @@ -1620,7 +1620,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) if (!intel_context_is_pinned(ce)) goto next_context; - local_stalled = false; + guilty = false; rq = intel_context_find_active_request(ce); if (!rq) { head = ce->ring->tail; @@ -1628,14 +1628,14 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) } if (i915_request_started(rq)) - local_stalled = true; + guilty = stalled & ce->engine->mask; GEM_BUG_ON(i915_active_is_idle(&ce->active)); head = intel_ring_wrap(ce->ring, rq->head); - __i915_request_reset(rq, local_stalled && stalled); + __i915_request_reset(rq, guilty); out_replay: - guc_reset_state(ce, head, local_stalled && stalled); + guc_reset_state(ce, head, guilty); next_context: if (i != number_children) ce = list_next_entry(ce, parallel.child_link); @@ -1645,7 +1645,7 @@ next_context: intel_context_put(parent); } -void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) +void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled) { struct intel_context *ce; unsigned long index; @@ -4013,7 +4013,7 @@ static void guc_context_replay(struct intel_context *ce) { struct i915_sched_engine *sched_engine = ce->engine->sched_engine; - __guc_reset_context(ce, true); + __guc_reset_context(ce, ce->engine->mask); tasklet_hi_schedule(&sched_engine->tasklet); } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index da199aa6989f..8eb34de2f20c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -593,7 +593,7 @@ sanitize: __uc_sanitize(uc); } -void intel_uc_reset(struct intel_uc *uc, bool stalled) +void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled) { struct intel_guc *guc = &uc->guc; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h index 866b462821c0..a8f38c2c60e2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h @@ -42,7 +42,7 @@ void intel_uc_driver_late_release(struct intel_uc *uc); void intel_uc_driver_remove(struct intel_uc *uc); void intel_uc_init_mmio(struct intel_uc *uc); void intel_uc_reset_prepare(struct intel_uc *uc); -void intel_uc_reset(struct intel_uc *uc, bool stalled); +void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled); void intel_uc_reset_finish(struct intel_uc *uc); void intel_uc_cancel_requests(struct intel_uc *uc); void intel_uc_suspend(struct intel_uc *uc); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index a9354f8f110d..fe960c204362 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -5501,6 +5501,22 @@ /* MMIO address range for DMC program (0x80000 - 0x82FFF) */ #define DMC_MMIO_START_RANGE 0x80000 #define DMC_MMIO_END_RANGE 0x8FFFF +#define DMC_V1_MMIO_START_RANGE 0x80000 +#define TGL_MAIN_MMIO_START 0x8F000 +#define TGL_MAIN_MMIO_END 0x8FFFF +#define _TGL_PIPEA_MMIO_START 0x92000 +#define _TGL_PIPEA_MMIO_END 0x93FFF +#define _TGL_PIPEB_MMIO_START 0x96000 +#define _TGL_PIPEB_MMIO_END 0x97FFF +#define ADLP_PIPE_MMIO_START 0x5F000 +#define ADLP_PIPE_MMIO_END 0x5FFFF + +#define TGL_PIPE_MMIO_START(dmc_id) _PICK_EVEN(((dmc_id) - 1), _TGL_PIPEA_MMIO_START,\ + _TGL_PIPEB_MMIO_START) + +#define TGL_PIPE_MMIO_END(dmc_id) _PICK_EVEN(((dmc_id) - 1), _TGL_PIPEA_MMIO_END,\ + _TGL_PIPEB_MMIO_END) + #define SKL_DMC_DC3_DC5_COUNT _MMIO(0x80030) #define SKL_DMC_DC5_DC6_COUNT _MMIO(0x8002C) #define BXT_DMC_DC3_DC5_COUNT _MMIO(0x80038) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index c0364314877e..c16157ee8c52 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -82,6 +82,7 @@ #define ISMT_DESC_ENTRIES 2 /* number of descriptor entries */ #define ISMT_MAX_RETRIES 3 /* number of SMBus retries to attempt */ +#define ISMT_LOG_ENTRIES 3 /* number of interrupt cause log entries */ /* Hardware Descriptor Constants - Control Field */ #define ISMT_DESC_CWRL 0x01 /* Command/Write Length */ @@ -175,6 +176,8 @@ struct ismt_priv { u8 head; /* ring buffer head pointer */ struct completion cmp; /* interrupt completion */ u8 buffer[I2C_SMBUS_BLOCK_MAX + 16]; /* temp R/W data buffer */ + dma_addr_t log_dma; + u32 *log; }; static const struct pci_device_id ismt_ids[] = { @@ -411,6 +414,9 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr, memset(desc, 0, sizeof(struct ismt_desc)); desc->tgtaddr_rw = ISMT_DESC_ADDR_RW(addr, read_write); + /* Always clear the log entries */ + memset(priv->log, 0, ISMT_LOG_ENTRIES * sizeof(u32)); + /* Initialize common control bits */ if (likely(pci_dev_msi_enabled(priv->pci_dev))) desc->control = ISMT_DESC_INT | ISMT_DESC_FAIR; @@ -708,6 +714,8 @@ static void ismt_hw_init(struct ismt_priv *priv) /* initialize the Master Descriptor Base Address (MDBA) */ writeq(priv->io_rng_dma, priv->smba + ISMT_MSTR_MDBA); + writeq(priv->log_dma, priv->smba + ISMT_GR_SMTICL); + /* initialize the Master Control Register (MCTRL) */ writel(ISMT_MCTRL_MEIE, priv->smba + ISMT_MSTR_MCTRL); @@ -795,6 +803,12 @@ static int ismt_dev_init(struct ismt_priv *priv) priv->head = 0; init_completion(&priv->cmp); + priv->log = dmam_alloc_coherent(&priv->pci_dev->dev, + ISMT_LOG_ENTRIES * sizeof(u32), + &priv->log_dma, GFP_KERNEL); + if (!priv->log) + return -ENOMEM; + return 0; } diff --git a/drivers/i2c/busses/i2c-mt7621.c b/drivers/i2c/busses/i2c-mt7621.c index 45fe4a7fe0c0..901f0fb04fee 100644 --- a/drivers/i2c/busses/i2c-mt7621.c +++ b/drivers/i2c/busses/i2c-mt7621.c @@ -304,7 +304,8 @@ static int mtk_i2c_probe(struct platform_device *pdev) if (i2c->bus_freq == 0) { dev_warn(i2c->dev, "clock-frequency 0 not supported\n"); - return -EINVAL; + ret = -EINVAL; + goto err_disable_clk; } adap = &i2c->adap; @@ -322,10 +323,15 @@ static int mtk_i2c_probe(struct platform_device *pdev) ret = i2c_add_adapter(adap); if (ret < 0) - return ret; + goto err_disable_clk; dev_info(&pdev->dev, "clock %u kHz\n", i2c->bus_freq / 1000); + return 0; + +err_disable_clk: + clk_disable_unprepare(i2c->clk); + return ret; } diff --git a/drivers/i2c/busses/i2c-thunderx-pcidrv.c b/drivers/i2c/busses/i2c-thunderx-pcidrv.c index 12c90aa0900e..a77cd86fe75e 100644 --- a/drivers/i2c/busses/i2c-thunderx-pcidrv.c +++ b/drivers/i2c/busses/i2c-thunderx-pcidrv.c @@ -213,6 +213,7 @@ static int thunder_i2c_probe_pci(struct pci_dev *pdev, i2c->adap.bus_recovery_info = &octeon_i2c_recovery_info; i2c->adap.dev.parent = dev; i2c->adap.dev.of_node = pdev->dev.of_node; + i2c->adap.dev.fwnode = dev->fwnode; snprintf(i2c->adap.name, sizeof(i2c->adap.name), "Cavium ThunderX i2c adapter at %s", dev_name(dev)); i2c_set_adapdata(&i2c->adap, i2c); diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index 2bd407d86bae..e9bd36adbe47 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -756,15 +756,12 @@ static int ili251x_firmware_reset(struct i2c_client *client) return ili251x_firmware_busy(client); } -static void ili251x_hardware_reset(struct device *dev) +static void ili210x_hardware_reset(struct gpio_desc *reset_gpio) { - struct i2c_client *client = to_i2c_client(dev); - struct ili210x *priv = i2c_get_clientdata(client); - /* Reset the controller */ - gpiod_set_value_cansleep(priv->reset_gpio, 1); - usleep_range(10000, 15000); - gpiod_set_value_cansleep(priv->reset_gpio, 0); + gpiod_set_value_cansleep(reset_gpio, 1); + usleep_range(12000, 15000); + gpiod_set_value_cansleep(reset_gpio, 0); msleep(300); } @@ -773,6 +770,7 @@ static ssize_t ili210x_firmware_update_store(struct device *dev, const char *buf, size_t count) { struct i2c_client *client = to_i2c_client(dev); + struct ili210x *priv = i2c_get_clientdata(client); const char *fwname = ILI251X_FW_FILENAME; const struct firmware *fw; u16 ac_end, df_end; @@ -803,7 +801,7 @@ static ssize_t ili210x_firmware_update_store(struct device *dev, dev_dbg(dev, "Firmware update started, firmware=%s\n", fwname); - ili251x_hardware_reset(dev); + ili210x_hardware_reset(priv->reset_gpio); error = ili251x_firmware_reset(client); if (error) @@ -858,7 +856,7 @@ static ssize_t ili210x_firmware_update_store(struct device *dev, error = count; exit: - ili251x_hardware_reset(dev); + ili210x_hardware_reset(priv->reset_gpio); dev_dbg(dev, "Firmware update ended, error=%i\n", error); enable_irq(client->irq); kfree(fwbuf); @@ -951,9 +949,7 @@ static int ili210x_i2c_probe(struct i2c_client *client, if (error) return error; - usleep_range(50, 100); - gpiod_set_value_cansleep(reset_gpio, 0); - msleep(100); + ili210x_hardware_reset(reset_gpio); } priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 180d7e9d3400..81c55bfd6e0c 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -21,7 +21,7 @@ #define MMC_BKOPS_TIMEOUT_MS (120 * 1000) /* 120s */ #define MMC_SANITIZE_TIMEOUT_MS (240 * 1000) /* 240s */ -#define MMC_OP_COND_PERIOD_US (1 * 1000) /* 1ms */ +#define MMC_OP_COND_PERIOD_US (4 * 1000) /* 4ms */ #define MMC_OP_COND_TIMEOUT_MS 1000 /* 1s */ static const u8 tuning_blk_pattern_4bit[] = { diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index b3b5bc1c803b..088bb1bcf1ef 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1495,34 +1495,22 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) err = can_set_static_ctrlmode(dev, CAN_CTRLMODE_FD_NON_ISO); if (err) return err; - cdev->can.bittiming_const = cdev->bit_timing ? - cdev->bit_timing : &m_can_bittiming_const_30X; - - cdev->can.data_bittiming_const = cdev->data_timing ? - cdev->data_timing : - &m_can_data_bittiming_const_30X; + cdev->can.bittiming_const = &m_can_bittiming_const_30X; + cdev->can.data_bittiming_const = &m_can_data_bittiming_const_30X; break; case 31: /* CAN_CTRLMODE_FD_NON_ISO is fixed with M_CAN IP v3.1.x */ err = can_set_static_ctrlmode(dev, CAN_CTRLMODE_FD_NON_ISO); if (err) return err; - cdev->can.bittiming_const = cdev->bit_timing ? - cdev->bit_timing : &m_can_bittiming_const_31X; - - cdev->can.data_bittiming_const = cdev->data_timing ? - cdev->data_timing : - &m_can_data_bittiming_const_31X; + cdev->can.bittiming_const = &m_can_bittiming_const_31X; + cdev->can.data_bittiming_const = &m_can_data_bittiming_const_31X; break; case 32: case 33: /* Support both MCAN version v3.2.x and v3.3.0 */ - cdev->can.bittiming_const = cdev->bit_timing ? - cdev->bit_timing : &m_can_bittiming_const_31X; - - cdev->can.data_bittiming_const = cdev->data_timing ? - cdev->data_timing : - &m_can_data_bittiming_const_31X; + cdev->can.bittiming_const = &m_can_bittiming_const_31X; + cdev->can.data_bittiming_const = &m_can_data_bittiming_const_31X; cdev->can.ctrlmode_supported |= (m_can_niso_supported(cdev) ? diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h index 2c5d40997168..d18b515e6ccc 100644 --- a/drivers/net/can/m_can/m_can.h +++ b/drivers/net/can/m_can/m_can.h @@ -85,9 +85,6 @@ struct m_can_classdev { struct sk_buff *tx_skb; struct phy *transceiver; - const struct can_bittiming_const *bit_timing; - const struct can_bittiming_const *data_timing; - struct m_can_ops *ops; int version; diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c index b56a54d6c5a9..8f184a852a0a 100644 --- a/drivers/net/can/m_can/m_can_pci.c +++ b/drivers/net/can/m_can/m_can_pci.c @@ -18,14 +18,9 @@ #define M_CAN_PCI_MMIO_BAR 0 +#define M_CAN_CLOCK_FREQ_EHL 200000000 #define CTL_CSR_INT_CTL_OFFSET 0x508 -struct m_can_pci_config { - const struct can_bittiming_const *bit_timing; - const struct can_bittiming_const *data_timing; - unsigned int clock_freq; -}; - struct m_can_pci_priv { struct m_can_classdev cdev; @@ -89,40 +84,9 @@ static struct m_can_ops m_can_pci_ops = { .read_fifo = iomap_read_fifo, }; -static const struct can_bittiming_const m_can_bittiming_const_ehl = { - .name = KBUILD_MODNAME, - .tseg1_min = 2, /* Time segment 1 = prop_seg + phase_seg1 */ - .tseg1_max = 64, - .tseg2_min = 1, /* Time segment 2 = phase_seg2 */ - .tseg2_max = 128, - .sjw_max = 128, - .brp_min = 1, - .brp_max = 512, - .brp_inc = 1, -}; - -static const struct can_bittiming_const m_can_data_bittiming_const_ehl = { - .name = KBUILD_MODNAME, - .tseg1_min = 2, /* Time segment 1 = prop_seg + phase_seg1 */ - .tseg1_max = 16, - .tseg2_min = 1, /* Time segment 2 = phase_seg2 */ - .tseg2_max = 8, - .sjw_max = 4, - .brp_min = 1, - .brp_max = 32, - .brp_inc = 1, -}; - -static const struct m_can_pci_config m_can_pci_ehl = { - .bit_timing = &m_can_bittiming_const_ehl, - .data_timing = &m_can_data_bittiming_const_ehl, - .clock_freq = 200000000, -}; - static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) { struct device *dev = &pci->dev; - const struct m_can_pci_config *cfg; struct m_can_classdev *mcan_class; struct m_can_pci_priv *priv; void __iomem *base; @@ -150,8 +114,6 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) if (!mcan_class) return -ENOMEM; - cfg = (const struct m_can_pci_config *)id->driver_data; - priv = cdev_to_priv(mcan_class); priv->base = base; @@ -163,9 +125,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) mcan_class->dev = &pci->dev; mcan_class->net->irq = pci_irq_vector(pci, 0); mcan_class->pm_clock_support = 1; - mcan_class->bit_timing = cfg->bit_timing; - mcan_class->data_timing = cfg->data_timing; - mcan_class->can.clock.freq = cfg->clock_freq; + mcan_class->can.clock.freq = id->driver_data; mcan_class->ops = &m_can_pci_ops; pci_set_drvdata(pci, mcan_class); @@ -218,8 +178,8 @@ static SIMPLE_DEV_PM_OPS(m_can_pci_pm_ops, m_can_pci_suspend, m_can_pci_resume); static const struct pci_device_id m_can_pci_id_table[] = { - { PCI_VDEVICE(INTEL, 0x4bc1), (kernel_ulong_t)&m_can_pci_ehl, }, - { PCI_VDEVICE(INTEL, 0x4bc2), (kernel_ulong_t)&m_can_pci_ehl, }, + { PCI_VDEVICE(INTEL, 0x4bc1), M_CAN_CLOCK_FREQ_EHL, }, + { PCI_VDEVICE(INTEL, 0x4bc2), M_CAN_CLOCK_FREQ_EHL, }, { } /* Terminating Entry */ }; MODULE_DEVICE_TABLE(pci, m_can_pci_id_table); diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 60dde29974bf..df51be3cbe06 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2585,8 +2585,10 @@ static int bcm_sysport_probe(struct platform_device *pdev) device_set_wakeup_capable(&pdev->dev, 1); priv->wol_clk = devm_clk_get_optional(&pdev->dev, "sw_sysportwol"); - if (IS_ERR(priv->wol_clk)) - return PTR_ERR(priv->wol_clk); + if (IS_ERR(priv->wol_clk)) { + ret = PTR_ERR(priv->wol_clk); + goto err_deregister_fixed_link; + } /* Set the needed headroom once and for all */ BUILD_BUG_ON(sizeof(struct bcm_tsb) != 8); diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index e475be29845c..61284baa0496 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1219,7 +1219,6 @@ static void gem_rx_refill(struct macb_queue *queue) /* Make hw descriptor updates visible to CPU */ rmb(); - queue->rx_prepared_head++; desc = macb_rx_desc(queue, entry); if (!queue->rx_skbuff[entry]) { @@ -1258,6 +1257,7 @@ static void gem_rx_refill(struct macb_queue *queue) dma_wmb(); desc->addr &= ~MACB_BIT(RX_USED); } + queue->rx_prepared_head++; } /* Make descriptor updates visible to hardware */ diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index caf48023f8ea..5231818943c6 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1928,6 +1928,11 @@ static int ftgmac100_probe(struct platform_device *pdev) /* AST2400 doesn't have working HW checksum generation */ if (np && (of_device_is_compatible(np, "aspeed,ast2400-mac"))) netdev->hw_features &= ~NETIF_F_HW_CSUM; + + /* AST2600 tx checksum with NCSI is broken */ + if (priv->use_ncsi && of_device_is_compatible(np, "aspeed,ast2600-mac")) + netdev->hw_features &= ~NETIF_F_HW_CSUM; + if (np && of_get_property(np, "no-hw-checksum", NULL)) netdev->hw_features &= ~(NETIF_F_HW_CSUM | NETIF_F_RXCSUM); netdev->features |= netdev->hw_features; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 6d19c58ccacd..454e01ae09b9 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3043,8 +3043,8 @@ ice_vsi_rebuild_get_coalesce(struct ice_vsi *vsi, ice_for_each_q_vector(vsi, i) { struct ice_q_vector *q_vector = vsi->q_vectors[i]; - coalesce[i].itr_tx = q_vector->tx.itr_setting; - coalesce[i].itr_rx = q_vector->rx.itr_setting; + coalesce[i].itr_tx = q_vector->tx.itr_settings; + coalesce[i].itr_rx = q_vector->rx.itr_settings; coalesce[i].intrl = q_vector->intrl; if (i < vsi->num_txq) @@ -3100,21 +3100,21 @@ ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, */ if (i < vsi->alloc_rxq && coalesce[i].rx_valid) { rc = &vsi->q_vectors[i]->rx; - rc->itr_setting = coalesce[i].itr_rx; + rc->itr_settings = coalesce[i].itr_rx; ice_write_itr(rc, rc->itr_setting); } else if (i < vsi->alloc_rxq) { rc = &vsi->q_vectors[i]->rx; - rc->itr_setting = coalesce[0].itr_rx; + rc->itr_settings = coalesce[0].itr_rx; ice_write_itr(rc, rc->itr_setting); } if (i < vsi->alloc_txq && coalesce[i].tx_valid) { rc = &vsi->q_vectors[i]->tx; - rc->itr_setting = coalesce[i].itr_tx; + rc->itr_settings = coalesce[i].itr_tx; ice_write_itr(rc, rc->itr_setting); } else if (i < vsi->alloc_txq) { rc = &vsi->q_vectors[i]->tx; - rc->itr_setting = coalesce[0].itr_tx; + rc->itr_settings = coalesce[0].itr_tx; ice_write_itr(rc, rc->itr_setting); } @@ -3128,12 +3128,12 @@ ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, for (; i < vsi->num_q_vectors; i++) { /* transmit */ rc = &vsi->q_vectors[i]->tx; - rc->itr_setting = coalesce[0].itr_tx; + rc->itr_settings = coalesce[0].itr_tx; ice_write_itr(rc, rc->itr_setting); /* receive */ rc = &vsi->q_vectors[i]->rx; - rc->itr_setting = coalesce[0].itr_rx; + rc->itr_settings = coalesce[0].itr_rx; ice_write_itr(rc, rc->itr_setting); vsi->q_vectors[i]->intrl = coalesce[0].intrl; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 949669fed7d6..963a5f40e071 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6172,9 +6172,10 @@ static int ice_up_complete(struct ice_vsi *vsi) ice_ptp_link_change(pf, pf->hw.pf_id, true); } - /* clear this now, and the first stats read will be used as baseline */ - vsi->stat_offsets_loaded = false; - + /* Perform an initial read of the statistics registers now to + * set the baseline so counters are ready when interface is up + */ + ice_update_eth_stats(vsi); ice_service_task_schedule(pf); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index da025c204577..662947c882e8 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -500,12 +500,19 @@ ice_ptp_read_src_clk_reg(struct ice_pf *pf, struct ptp_system_timestamp *sts) * This function must be called periodically to ensure that the cached value * is never more than 2 seconds old. It must also be called whenever the PHC * time has been changed. + * + * Return: + * * 0 - OK, successfully updated + * * -EAGAIN - PF was busy, need to reschedule the update */ -static void ice_ptp_update_cached_phctime(struct ice_pf *pf) +static int ice_ptp_update_cached_phctime(struct ice_pf *pf) { u64 systime; int i; + if (test_and_set_bit(ICE_CFG_BUSY, pf->state)) + return -EAGAIN; + /* Read the current PHC time */ systime = ice_ptp_read_src_clk_reg(pf, NULL); @@ -528,6 +535,9 @@ static void ice_ptp_update_cached_phctime(struct ice_pf *pf) WRITE_ONCE(vsi->rx_rings[j]->cached_phctime, systime); } } + clear_bit(ICE_CFG_BUSY, pf->state); + + return 0; } /** @@ -2330,17 +2340,18 @@ static void ice_ptp_periodic_work(struct kthread_work *work) { struct ice_ptp *ptp = container_of(work, struct ice_ptp, work.work); struct ice_pf *pf = container_of(ptp, struct ice_pf, ptp); + int err; if (!test_bit(ICE_FLAG_PTP, pf->flags)) return; - ice_ptp_update_cached_phctime(pf); + err = ice_ptp_update_cached_phctime(pf); ice_ptp_tx_tstamp_cleanup(&pf->hw, &pf->ptp.port.tx); - /* Run twice a second */ + /* Run twice a second or reschedule if phc update failed */ kthread_queue_delayed_work(ptp->kworker, &ptp->work, - msecs_to_jiffies(500)); + msecs_to_jiffies(err ? 10 : 500)); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index cead3eb149bd..ffb3f6a589da 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -384,9 +384,14 @@ struct ice_ring_container { /* this matches the maximum number of ITR bits, but in usec * values, so it is shifted left one bit (bit zero is ignored) */ - u16 itr_setting:13; - u16 itr_reserved:2; - u16 itr_mode:1; + union { + struct { + u16 itr_setting:13; + u16 itr_reserved:2; + u16 itr_mode:1; + }; + u16 itr_settings; + }; enum ice_container_type type; }; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 34b33b21e0dc..68be2976f539 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5505,7 +5505,8 @@ static void igb_watchdog_task(struct work_struct *work) break; } - if (adapter->link_speed != SPEED_1000) + if (adapter->link_speed != SPEED_1000 || + !hw->phy.ops.read_reg) goto no_wait; /* wait for Remote receiver status OK */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c index 59988e24b704..bec9ed0103a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c @@ -23,7 +23,7 @@ struct mlx5_ct_fs_smfs_matcher { }; struct mlx5_ct_fs_smfs_matchers { - struct mlx5_ct_fs_smfs_matcher smfs_matchers[4]; + struct mlx5_ct_fs_smfs_matcher smfs_matchers[6]; struct list_head used; }; @@ -44,7 +44,8 @@ struct mlx5_ct_fs_smfs_rule { }; static inline void -mlx5_ct_fs_smfs_fill_mask(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, bool ipv4, bool tcp) +mlx5_ct_fs_smfs_fill_mask(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, bool ipv4, bool tcp, + bool gre) { void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); @@ -77,7 +78,7 @@ mlx5_ct_fs_smfs_fill_mask(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, bo MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, tcp_dport); MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags, ntohs(MLX5_CT_TCP_FLAGS_MASK)); - } else { + } else if (!gre) { MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, udp_sport); MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, udp_dport); } @@ -87,7 +88,7 @@ mlx5_ct_fs_smfs_fill_mask(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, bo static struct mlx5dr_matcher * mlx5_ct_fs_smfs_matcher_create(struct mlx5_ct_fs *fs, struct mlx5dr_table *tbl, bool ipv4, - bool tcp, u32 priority) + bool tcp, bool gre, u32 priority) { struct mlx5dr_matcher *dr_matcher; struct mlx5_flow_spec *spec; @@ -96,7 +97,7 @@ mlx5_ct_fs_smfs_matcher_create(struct mlx5_ct_fs *fs, struct mlx5dr_table *tbl, if (!spec) return ERR_PTR(-ENOMEM); - mlx5_ct_fs_smfs_fill_mask(fs, spec, ipv4, tcp); + mlx5_ct_fs_smfs_fill_mask(fs, spec, ipv4, tcp, gre); spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2 | MLX5_MATCH_OUTER_HEADERS; dr_matcher = mlx5_smfs_matcher_create(tbl, priority, spec); @@ -108,7 +109,7 @@ mlx5_ct_fs_smfs_matcher_create(struct mlx5_ct_fs *fs, struct mlx5dr_table *tbl, } static struct mlx5_ct_fs_smfs_matcher * -mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp) +mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp, bool gre) { struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); struct mlx5_ct_fs_smfs_matcher *m, *smfs_matcher; @@ -119,7 +120,7 @@ mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp int prio; matchers = nat ? &fs_smfs->matchers_nat : &fs_smfs->matchers; - smfs_matcher = &matchers->smfs_matchers[ipv4 * 2 + tcp]; + smfs_matcher = &matchers->smfs_matchers[ipv4 * 3 + tcp * 2 + gre]; if (refcount_inc_not_zero(&smfs_matcher->ref)) return smfs_matcher; @@ -145,11 +146,11 @@ mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp } tbl = nat ? fs_smfs->ct_nat_tbl : fs_smfs->ct_tbl; - dr_matcher = mlx5_ct_fs_smfs_matcher_create(fs, tbl, ipv4, tcp, prio); + dr_matcher = mlx5_ct_fs_smfs_matcher_create(fs, tbl, ipv4, tcp, gre, prio); if (IS_ERR(dr_matcher)) { netdev_warn(fs->netdev, - "ct_fs_smfs: failed to create matcher (nat %d, ipv4 %d, tcp %d), err: %ld\n", - nat, ipv4, tcp, PTR_ERR(dr_matcher)); + "ct_fs_smfs: failed to create matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %ld\n", + nat, ipv4, tcp, gre, PTR_ERR(dr_matcher)); smfs_matcher = ERR_CAST(dr_matcher); goto out_unlock; @@ -222,16 +223,17 @@ mlx5_ct_fs_smfs_destroy(struct mlx5_ct_fs *fs) static inline bool mlx5_tc_ct_valid_used_dissector_keys(const u32 used_keys) { -#define DISSECTOR_BIT(name) BIT(FLOW_DISSECTOR_KEY_ ## name) - const u32 basic_keys = DISSECTOR_BIT(BASIC) | DISSECTOR_BIT(CONTROL) | - DISSECTOR_BIT(PORTS) | DISSECTOR_BIT(META); - const u32 ipv4_tcp = basic_keys | DISSECTOR_BIT(IPV4_ADDRS) | DISSECTOR_BIT(TCP); - const u32 ipv4_udp = basic_keys | DISSECTOR_BIT(IPV4_ADDRS); - const u32 ipv6_tcp = basic_keys | DISSECTOR_BIT(IPV6_ADDRS) | DISSECTOR_BIT(TCP); - const u32 ipv6_udp = basic_keys | DISSECTOR_BIT(IPV6_ADDRS); +#define DISS_BIT(name) BIT(FLOW_DISSECTOR_KEY_ ## name) + const u32 basic_keys = DISS_BIT(BASIC) | DISS_BIT(CONTROL) | DISS_BIT(META); + const u32 ipv4_tcp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); + const u32 ipv6_tcp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); + const u32 ipv4_udp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS); + const u32 ipv6_udp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS); + const u32 ipv4_gre = basic_keys | DISS_BIT(IPV4_ADDRS); + const u32 ipv6_gre = basic_keys | DISS_BIT(IPV6_ADDRS); return (used_keys == ipv4_tcp || used_keys == ipv4_udp || used_keys == ipv6_tcp || - used_keys == ipv6_udp); + used_keys == ipv6_udp || used_keys == ipv4_gre || used_keys == ipv6_gre); } static bool @@ -254,20 +256,24 @@ mlx5_ct_fs_smfs_ct_validate_flow_rule(struct mlx5_ct_fs *fs, struct flow_rule *f flow_rule_match_control(flow_rule, &control); flow_rule_match_ipv4_addrs(flow_rule, &ipv4_addrs); flow_rule_match_ipv6_addrs(flow_rule, &ipv6_addrs); - flow_rule_match_ports(flow_rule, &ports); - flow_rule_match_tcp(flow_rule, &tcp); + if (basic.key->ip_proto != IPPROTO_GRE) + flow_rule_match_ports(flow_rule, &ports); + if (basic.key->ip_proto == IPPROTO_TCP) + flow_rule_match_tcp(flow_rule, &tcp); if (basic.mask->n_proto != htons(0xFFFF) || (basic.key->n_proto != htons(ETH_P_IP) && basic.key->n_proto != htons(ETH_P_IPV6)) || basic.mask->ip_proto != 0xFF || - (basic.key->ip_proto != IPPROTO_UDP && basic.key->ip_proto != IPPROTO_TCP)) { + (basic.key->ip_proto != IPPROTO_UDP && basic.key->ip_proto != IPPROTO_TCP && + basic.key->ip_proto != IPPROTO_GRE)) { ct_dbg("rule uses unexpected basic match (n_proto 0x%04x/0x%04x, ip_proto 0x%02x/0x%02x)", ntohs(basic.key->n_proto), ntohs(basic.mask->n_proto), basic.key->ip_proto, basic.mask->ip_proto); return false; } - if (ports.mask->src != htons(0xFFFF) || ports.mask->dst != htons(0xFFFF)) { + if (basic.key->ip_proto != IPPROTO_GRE && + (ports.mask->src != htons(0xFFFF) || ports.mask->dst != htons(0xFFFF))) { ct_dbg("rule uses ports match (src 0x%04x, dst 0x%04x)", ports.mask->src, ports.mask->dst); return false; @@ -291,7 +297,7 @@ mlx5_ct_fs_smfs_ct_rule_add(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, struct mlx5dr_action *actions[5]; struct mlx5dr_rule *rule; int num_actions = 0, err; - bool nat, tcp, ipv4; + bool nat, tcp, ipv4, gre; if (!mlx5_ct_fs_smfs_ct_validate_flow_rule(fs, flow_rule)) return ERR_PTR(-EOPNOTSUPP); @@ -314,15 +320,17 @@ mlx5_ct_fs_smfs_ct_rule_add(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, ipv4 = mlx5e_tc_get_ip_version(spec, true) == 4; tcp = MLX5_GET(fte_match_param, spec->match_value, outer_headers.ip_protocol) == IPPROTO_TCP; + gre = MLX5_GET(fte_match_param, spec->match_value, + outer_headers.ip_protocol) == IPPROTO_GRE; - smfs_matcher = mlx5_ct_fs_smfs_matcher_get(fs, nat, ipv4, tcp); + smfs_matcher = mlx5_ct_fs_smfs_matcher_get(fs, nat, ipv4, tcp, gre); if (IS_ERR(smfs_matcher)) { err = PTR_ERR(smfs_matcher); goto err_matcher; } rule = mlx5_smfs_rule_create(smfs_matcher->dr_matcher, spec, num_actions, actions, - MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT); + spec->flow_context.flow_source); if (!rule) { err = -EINVAL; goto err_create; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index a55b066746cb..857840ab1e91 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -14,19 +14,26 @@ static int mlx5e_trap_napi_poll(struct napi_struct *napi, int budget) bool busy = false; int work_done = 0; + rcu_read_lock(); + ch_stats->poll++; work_done = mlx5e_poll_rx_cq(&rq->cq, budget); busy |= work_done == budget; busy |= rq->post_wqes(rq); - if (busy) - return budget; + if (busy) { + work_done = budget; + goto out; + } if (unlikely(!napi_complete_done(napi, work_done))) - return work_done; + goto out; mlx5e_cq_arm(&rq->cq); + +out: + rcu_read_unlock(); return work_done; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2f1dedc721d1..fa229998606c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3864,6 +3864,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev if (netdev->features & NETIF_F_NTUPLE) netdev_warn(netdev, "Disabling ntuple, not supported in switchdev mode\n"); + features &= ~NETIF_F_GRO_HW; + if (netdev->features & NETIF_F_GRO_HW) + netdev_warn(netdev, "Disabling HW_GRO, not supported in switchdev mode\n"); + return features; } @@ -3896,6 +3900,25 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, } } + if (params->xdp_prog) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "LRO is incompatible with XDP\n"); + features &= ~NETIF_F_LRO; + } + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with XDP\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + if (priv->xsk.refcnt) { + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with AF_XDP (%u XSKs are active)\n", + priv->xsk.refcnt); + features &= ~NETIF_F_GRO_HW; + } + } + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { features &= ~NETIF_F_RXHASH; if (netdev->features & NETIF_F_RXHASH) @@ -4850,10 +4873,6 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; - if (!!MLX5_CAP_GEN(mdev, shampo) && - mlx5e_check_fragmented_striding_rq_cap(mdev)) - netdev->hw_features |= NETIF_F_GRO_HW; - if (mlx5e_tunnel_any_tx_proto_supported(mdev)) { netdev->hw_enc_features |= NETIF_F_HW_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 816d991f7621..3ad67e6b5586 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2663,28 +2663,6 @@ static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns) clean_tree(&root_ns->ns.node); } -void mlx5_cleanup_fs(struct mlx5_core_dev *dev) -{ - struct mlx5_flow_steering *steering = dev->priv.steering; - - cleanup_root_ns(steering->root_ns); - cleanup_root_ns(steering->fdb_root_ns); - steering->fdb_root_ns = NULL; - kfree(steering->fdb_sub_ns); - steering->fdb_sub_ns = NULL; - cleanup_root_ns(steering->port_sel_root_ns); - cleanup_root_ns(steering->sniffer_rx_root_ns); - cleanup_root_ns(steering->sniffer_tx_root_ns); - cleanup_root_ns(steering->rdma_rx_root_ns); - cleanup_root_ns(steering->rdma_tx_root_ns); - cleanup_root_ns(steering->egress_root_ns); - mlx5_cleanup_fc_stats(dev); - kmem_cache_destroy(steering->ftes_cache); - kmem_cache_destroy(steering->fgs_cache); - mlx5_ft_pool_destroy(dev); - kfree(steering); -} - static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering) { struct fs_prio *prio; @@ -3086,42 +3064,27 @@ cleanup: return err; } -int mlx5_init_fs(struct mlx5_core_dev *dev) +void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev) { - struct mlx5_flow_steering *steering; - int err = 0; - - err = mlx5_init_fc_stats(dev); - if (err) - return err; - - err = mlx5_ft_pool_init(dev); - if (err) - return err; - - steering = kzalloc(sizeof(*steering), GFP_KERNEL); - if (!steering) { - err = -ENOMEM; - goto err; - } - - steering->dev = dev; - dev->priv.steering = steering; + struct mlx5_flow_steering *steering = dev->priv.steering; - if (mlx5_fs_dr_is_supported(dev)) - steering->mode = MLX5_FLOW_STEERING_MODE_SMFS; - else - steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; + cleanup_root_ns(steering->root_ns); + cleanup_root_ns(steering->fdb_root_ns); + steering->fdb_root_ns = NULL; + kfree(steering->fdb_sub_ns); + steering->fdb_sub_ns = NULL; + cleanup_root_ns(steering->port_sel_root_ns); + cleanup_root_ns(steering->sniffer_rx_root_ns); + cleanup_root_ns(steering->sniffer_tx_root_ns); + cleanup_root_ns(steering->rdma_rx_root_ns); + cleanup_root_ns(steering->rdma_tx_root_ns); + cleanup_root_ns(steering->egress_root_ns); +} - steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs", - sizeof(struct mlx5_flow_group), 0, - 0, NULL); - steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0, - 0, NULL); - if (!steering->ftes_cache || !steering->fgs_cache) { - err = -ENOMEM; - goto err; - } +int mlx5_fs_core_init(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int err = 0; if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && (MLX5_CAP_GEN(dev, nic_flow_table))) || @@ -3180,8 +3143,64 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) } return 0; + +err: + mlx5_fs_core_cleanup(dev); + return err; +} + +void mlx5_fs_core_free(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + kmem_cache_destroy(steering->ftes_cache); + kmem_cache_destroy(steering->fgs_cache); + kfree(steering); + mlx5_ft_pool_destroy(dev); + mlx5_cleanup_fc_stats(dev); +} + +int mlx5_fs_core_alloc(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering; + int err = 0; + + err = mlx5_init_fc_stats(dev); + if (err) + return err; + + err = mlx5_ft_pool_init(dev); + if (err) + goto err; + + steering = kzalloc(sizeof(*steering), GFP_KERNEL); + if (!steering) { + err = -ENOMEM; + goto err; + } + + steering->dev = dev; + dev->priv.steering = steering; + + if (mlx5_fs_dr_is_supported(dev)) + steering->mode = MLX5_FLOW_STEERING_MODE_SMFS; + else + steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; + + steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs", + sizeof(struct mlx5_flow_group), 0, + 0, NULL); + steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0, + 0, NULL); + if (!steering->ftes_cache || !steering->fgs_cache) { + err = -ENOMEM; + goto err; + } + + return 0; + err: - mlx5_cleanup_fs(dev); + mlx5_fs_core_free(dev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index c488a7c5b07e..3f20523e514f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -298,8 +298,10 @@ int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns, int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns, enum mlx5_flow_steering_mode mode); -int mlx5_init_fs(struct mlx5_core_dev *dev); -void mlx5_cleanup_fs(struct mlx5_core_dev *dev); +int mlx5_fs_core_alloc(struct mlx5_core_dev *dev); +void mlx5_fs_core_free(struct mlx5_core_dev *dev); +int mlx5_fs_core_init(struct mlx5_core_dev *dev); +void mlx5_fs_core_cleanup(struct mlx5_core_dev *dev); int mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports); void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index ca1aba845dd6..81eb67fb95b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -8,7 +8,8 @@ enum { MLX5_FW_RESET_FLAGS_RESET_REQUESTED, MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, - MLX5_FW_RESET_FLAGS_PENDING_COMP + MLX5_FW_RESET_FLAGS_PENDING_COMP, + MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS }; struct mlx5_fw_reset { @@ -208,7 +209,10 @@ static void poll_sync_reset(struct timer_list *t) if (fatal_error) { mlx5_core_warn(dev, "Got Device Reset\n"); - queue_work(fw_reset->wq, &fw_reset->reset_reload_work); + if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) + queue_work(fw_reset->wq, &fw_reset->reset_reload_work); + else + mlx5_core_err(dev, "Device is being removed, Drop new reset work\n"); return; } @@ -433,9 +437,12 @@ static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long acti struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb); struct mlx5_eqe *eqe = data; + if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) + return NOTIFY_DONE; + switch (eqe->sub_type) { case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT: - queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work); + queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work); break; case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT: mlx5_sync_reset_events_handle(fw_reset, eqe); @@ -479,6 +486,18 @@ void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev) mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb); } +void mlx5_drain_fw_reset(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags); + cancel_work_sync(&fw_reset->fw_live_patch_work); + cancel_work_sync(&fw_reset->reset_request_work); + cancel_work_sync(&fw_reset->reset_reload_work); + cancel_work_sync(&fw_reset->reset_now_work); + cancel_work_sync(&fw_reset->reset_abort_work); +} + int mlx5_fw_reset_init(struct mlx5_core_dev *dev) { struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h index 694fc7cb2684..dc141c7e641a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h @@ -16,6 +16,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev); int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev); void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev); void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev); +void mlx5_drain_fw_reset(struct mlx5_core_dev *dev); int mlx5_fw_reset_init(struct mlx5_core_dev *dev); void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 2589e39eb9c7..ef196cb764e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -938,6 +938,12 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) goto err_sf_table_cleanup; } + err = mlx5_fs_core_alloc(dev); + if (err) { + mlx5_core_err(dev, "Failed to alloc flow steering\n"); + goto err_fs; + } + dev->dm = mlx5_dm_create(dev); if (IS_ERR(dev->dm)) mlx5_core_warn(dev, "Failed to init device memory%d\n", err); @@ -948,6 +954,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) return 0; +err_fs: + mlx5_sf_table_cleanup(dev); err_sf_table_cleanup: mlx5_sf_hw_table_cleanup(dev); err_sf_hw_table_cleanup: @@ -985,6 +993,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); + mlx5_fs_core_free(dev); mlx5_sf_table_cleanup(dev); mlx5_sf_hw_table_cleanup(dev); mlx5_vhca_event_cleanup(dev); @@ -1191,7 +1200,7 @@ static int mlx5_load(struct mlx5_core_dev *dev) goto err_tls_start; } - err = mlx5_init_fs(dev); + err = mlx5_fs_core_init(dev); if (err) { mlx5_core_err(dev, "Failed to init flow steering\n"); goto err_fs; @@ -1236,7 +1245,7 @@ err_ec: err_vhca: mlx5_vhca_event_stop(dev); err_set_hca: - mlx5_cleanup_fs(dev); + mlx5_fs_core_cleanup(dev); err_fs: mlx5_accel_tls_cleanup(dev); err_tls_start: @@ -1265,7 +1274,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_ec_cleanup(dev); mlx5_sf_hw_table_destroy(dev); mlx5_vhca_event_stop(dev); - mlx5_cleanup_fs(dev); + mlx5_fs_core_cleanup(dev); mlx5_accel_ipsec_cleanup(dev); mlx5_accel_tls_cleanup(dev); mlx5_fpga_device_stop(dev); @@ -1618,6 +1627,10 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct devlink *devlink = priv_to_devlink(dev); + /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain + * fw_reset before unregistering the devlink. + */ + mlx5_drain_fw_reset(dev); devlink_unregister(devlink); mlx5_sriov_disable(pdev); mlx5_crdump_disable(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 850937cd8bf9..1383550f44c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -530,6 +530,37 @@ static int dr_action_handle_cs_recalc(struct mlx5dr_domain *dmn, return 0; } +static void dr_action_modify_ttl_adjust(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_actions_attr *attr, + bool rx_rule, + bool *recalc_cs_required) +{ + *recalc_cs_required = false; + + /* if device supports csum recalculation - no adjustment needed */ + if (mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps)) + return; + + /* no adjustment needed on TX rules */ + if (!rx_rule) + return; + + if (!MLX5_CAP_ESW_FLOWTABLE(dmn->mdev, fdb_ipv4_ttl_modify)) { + /* Ignore the modify TTL action. + * It is always kept as last HW action. + */ + attr->modify_actions--; + return; + } + + if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB) + /* Due to a HW bug on some devices, modifying TTL on RX flows + * will cause an incorrect checksum calculation. In such cases + * we will use a FW table to recalculate the checksum. + */ + *recalc_cs_required = true; +} + static void dr_action_print_sequence(struct mlx5dr_domain *dmn, struct mlx5dr_action *actions[], int last_idx) @@ -650,8 +681,9 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, case DR_ACTION_TYP_MODIFY_HDR: attr.modify_index = action->rewrite->index; attr.modify_actions = action->rewrite->num_of_actions; - recalc_cs_required = action->rewrite->modify_ttl && - !mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps); + if (action->rewrite->modify_ttl) + dr_action_modify_ttl_adjust(dmn, &attr, rx_rule, + &recalc_cs_required); break; case DR_ACTION_TYP_L2_TO_TNL_L2: case DR_ACTION_TYP_L2_TO_TNL_L3: @@ -732,12 +764,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, *new_hw_ste_arr_sz = nic_matcher->num_of_builders; last_ste = ste_arr + DR_STE_SIZE * (nic_matcher->num_of_builders - 1); - /* Due to a HW bug in some devices, modifying TTL on RX flows will - * cause an incorrect checksum calculation. In this case we will - * use a FW table to recalculate. - */ - if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB && - rx_rule && recalc_cs_required && dest_action) { + if (recalc_cs_required && dest_action) { ret = dr_action_handle_cs_recalc(dmn, dest_action, &attr.final_icm_addr); if (ret) { mlx5dr_err(dmn, @@ -842,7 +869,8 @@ struct mlx5dr_action * mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, struct mlx5dr_action_dest *dests, u32 num_of_dests, - bool ignore_flow_level) + bool ignore_flow_level, + u32 flow_source) { struct mlx5dr_cmd_flow_destination_hw_info *hw_dests; struct mlx5dr_action **ref_actions; @@ -914,7 +942,8 @@ mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, reformat_req, &action->dest_tbl->fw_tbl.id, &action->dest_tbl->fw_tbl.group_id, - ignore_flow_level); + ignore_flow_level, + flow_source); if (ret) goto free_action; @@ -1556,12 +1585,6 @@ dr_action_modify_check_is_ttl_modify(const void *sw_action) return sw_field == MLX5_ACTION_IN_FIELD_OUT_IP_TTL; } -static bool dr_action_modify_ttl_ignore(struct mlx5dr_domain *dmn) -{ - return !mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps) && - !MLX5_CAP_ESW_FLOWTABLE(dmn->mdev, fdb_ipv4_ttl_modify); -} - static int dr_actions_convert_modify_header(struct mlx5dr_action *action, u32 max_hw_actions, u32 num_sw_actions, @@ -1573,6 +1596,7 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action, const struct mlx5dr_ste_action_modify_field *hw_dst_action_info; const struct mlx5dr_ste_action_modify_field *hw_src_action_info; struct mlx5dr_domain *dmn = action->rewrite->dmn; + __be64 *modify_ttl_sw_action = NULL; int ret, i, hw_idx = 0; __be64 *sw_action; __be64 hw_action; @@ -1585,8 +1609,14 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action, action->rewrite->allow_rx = 1; action->rewrite->allow_tx = 1; - for (i = 0; i < num_sw_actions; i++) { - sw_action = &sw_actions[i]; + for (i = 0; i < num_sw_actions || modify_ttl_sw_action; i++) { + /* modify TTL is handled separately, as a last action */ + if (i == num_sw_actions) { + sw_action = modify_ttl_sw_action; + modify_ttl_sw_action = NULL; + } else { + sw_action = &sw_actions[i]; + } ret = dr_action_modify_check_field_limitation(action, sw_action); @@ -1595,10 +1625,9 @@ static int dr_actions_convert_modify_header(struct mlx5dr_action *action, if (!(*modify_ttl) && dr_action_modify_check_is_ttl_modify(sw_action)) { - if (dr_action_modify_ttl_ignore(dmn)) - continue; - + modify_ttl_sw_action = sw_action; *modify_ttl = true; + continue; } /* Convert SW action to HW action */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c index 68a4c32d5f34..f05ef0cd54ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c @@ -104,7 +104,8 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, bool reformat_req, u32 *tbl_id, u32 *group_id, - bool ignore_flow_level) + bool ignore_flow_level, + u32 flow_source) { struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; struct mlx5dr_cmd_fte_info fte_info = {}; @@ -139,6 +140,7 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, fte_info.val = val; fte_info.dest_arr = dest; fte_info.ignore_flow_level = ignore_flow_level; + fte_info.flow_context.flow_source = flow_source; ret = mlx5dr_cmd_set_fte(dmn->mdev, 0, 0, &ft_info, *group_id, &fte_info); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c index 5a322335f204..2010d4ac6519 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c @@ -420,7 +420,7 @@ dr_ste_v0_set_actions_tx(struct mlx5dr_domain *dmn, * encapsulation. The reason for that is that we support * modify headers for outer headers only */ - if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) { dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); dr_ste_v0_set_rewrite_actions(last_ste, attr->modify_actions, @@ -513,7 +513,7 @@ dr_ste_v0_set_actions_rx(struct mlx5dr_domain *dmn, } } - if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) { if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) dr_ste_v0_arr_init_next(&last_ste, added_stes, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 46866a5fc5ca..98320e3945ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -1461,7 +1461,8 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, bool reformat_req, u32 *tbl_id, u32 *group_id, - bool ignore_flow_level); + bool ignore_flow_level, + u32 flow_source); void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, u32 tbl_id, u32 group_id); #endif /* _DR_TYPES_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 045b0cf90063..728f81882589 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -520,6 +520,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, } else if (num_term_actions > 1) { bool ignore_flow_level = !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL); + u32 flow_source = fte->flow_context.flow_source; if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || fs_dr_num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { @@ -529,7 +530,8 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, tmp_action = mlx5dr_action_create_mult_dest_tbl(domain, term_actions, num_term_actions, - ignore_flow_level); + ignore_flow_level, + flow_source); if (!tmp_action) { err = -EOPNOTSUPP; goto free_actions; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index ec5cbec0d455..7626c85643b1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -99,7 +99,8 @@ struct mlx5dr_action * mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, struct mlx5dr_action_dest *dests, u32 num_of_dests, - bool ignore_flow_level); + bool ignore_flow_level, + u32 flow_source); struct mlx5dr_action *mlx5dr_action_create_drop(void); diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 95830e3e2b1f..05f6dcc9dfd5 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -103,6 +103,24 @@ static int lan966x_create_targets(struct platform_device *pdev, return 0; } +static bool lan966x_port_unique_address(struct net_device *dev) +{ + struct lan966x_port *port = netdev_priv(dev); + struct lan966x *lan966x = port->lan966x; + int p; + + for (p = 0; p < lan966x->num_phys_ports; ++p) { + port = lan966x->ports[p]; + if (!port || port->dev == dev) + continue; + + if (ether_addr_equal(dev->dev_addr, port->dev->dev_addr)) + return false; + } + + return true; +} + static int lan966x_port_set_mac_address(struct net_device *dev, void *p) { struct lan966x_port *port = netdev_priv(dev); @@ -110,16 +128,26 @@ static int lan966x_port_set_mac_address(struct net_device *dev, void *p) const struct sockaddr *addr = p; int ret; + if (ether_addr_equal(addr->sa_data, dev->dev_addr)) + return 0; + /* Learn the new net device MAC address in the mac table. */ ret = lan966x_mac_cpu_learn(lan966x, addr->sa_data, HOST_PVID); if (ret) return ret; + /* If there is another port with the same address as the dev, then don't + * delete it from the MAC table + */ + if (!lan966x_port_unique_address(dev)) + goto out; + /* Then forget the previous one. */ ret = lan966x_mac_cpu_forget(lan966x, dev->dev_addr, HOST_PVID); if (ret) return ret; +out: eth_hw_addr_set(dev, addr->sa_data); return ret; } diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index b30589a135c2..06f4d9a9e938 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3614,7 +3614,8 @@ static void ql_reset_work(struct work_struct *work) qdev->mem_map_registers; unsigned long hw_flags; - if (test_bit((QL_RESET_PER_SCSI | QL_RESET_START), &qdev->flags)) { + if (test_bit(QL_RESET_PER_SCSI, &qdev->flags) || + test_bit(QL_RESET_START, &qdev->flags)) { clear_bit(QL_LINK_MASTER, &qdev->flags); /* diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index bc981043cc80..a701178a1d13 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1367,9 +1367,10 @@ static void gsi_evt_ring_rx_update(struct gsi_evt_ring *evt_ring, u32 index) struct gsi_event *event_done; struct gsi_event *event; struct gsi_trans *trans; + u32 trans_count = 0; u32 byte_count = 0; - u32 old_index; u32 event_avail; + u32 old_index; trans_info = &channel->trans_info; @@ -1390,6 +1391,7 @@ static void gsi_evt_ring_rx_update(struct gsi_evt_ring *evt_ring, u32 index) do { trans->len = __le16_to_cpu(event->len); byte_count += trans->len; + trans_count++; /* Move on to the next event and transaction */ if (--event_avail) @@ -1401,7 +1403,7 @@ static void gsi_evt_ring_rx_update(struct gsi_evt_ring *evt_ring, u32 index) /* We record RX bytes when they are received */ channel->byte_count += byte_count; - channel->trans_count++; + channel->trans_count += trans_count; } /* Initialize a ring, including allocating DMA memory for its entries */ diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 888e94278a84..cea7b2e2ce96 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1150,13 +1150,12 @@ static void ipa_endpoint_skb_copy(struct ipa_endpoint *endpoint, return; skb = __dev_alloc_skb(len, GFP_ATOMIC); - if (!skb) - return; - - /* Copy the data into the socket buffer and receive it */ - skb_put(skb, len); - memcpy(skb->data, data, len); - skb->truesize += extra; + if (skb) { + /* Copy the data into the socket buffer and receive it */ + skb_put(skb, len); + memcpy(skb->data, data, len); + skb->truesize += extra; + } ipa_modem_skb_rx(endpoint->netdev, skb); } diff --git a/drivers/net/ipa/ipa_qmi.c b/drivers/net/ipa/ipa_qmi.c index 90f3aec55b36..ec010cf2e816 100644 --- a/drivers/net/ipa/ipa_qmi.c +++ b/drivers/net/ipa/ipa_qmi.c @@ -125,7 +125,7 @@ static void ipa_qmi_indication(struct ipa_qmi *ipa_qmi) */ static void ipa_qmi_ready(struct ipa_qmi *ipa_qmi) { - struct ipa *ipa = container_of(ipa_qmi, struct ipa, qmi); + struct ipa *ipa; int ret; /* We aren't ready until the modem and microcontroller are */ diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 3619520340b7..e172743948ed 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -988,6 +988,7 @@ static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); + memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index d9d90baac72a..93e8d119d45f 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -589,6 +589,7 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx, if (dma_mapping_error(&adapter->pdev->dev, rbi->dma_addr)) { dev_kfree_skb_any(rbi->skb); + rbi->skb = NULL; rq->stats.rx_buf_alloc_failure++; break; } @@ -613,6 +614,7 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx, if (dma_mapping_error(&adapter->pdev->dev, rbi->dma_addr)) { put_page(rbi->page); + rbi->page = NULL; rq->stats.rx_buf_alloc_failure++; break; } @@ -1666,6 +1668,10 @@ vmxnet3_rq_cleanup(struct vmxnet3_rx_queue *rq, u32 i, ring_idx; struct Vmxnet3_RxDesc *rxd; + /* ring has already been cleaned up */ + if (!rq->rx_ring[0].base) + return; + for (ring_idx = 0; ring_idx < 2; ring_idx++) { for (i = 0; i < rq->rx_ring[ring_idx].size; i++) { #ifdef __BIG_ENDIAN_BITFIELD diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c index a491db46e3bd..d9f6367b9993 100644 --- a/drivers/nfc/pn533/pn533.c +++ b/drivers/nfc/pn533/pn533.c @@ -2787,13 +2787,14 @@ void pn53x_common_clean(struct pn533 *priv) { struct pn533_cmd *cmd, *n; + /* delete the timer before cleanup the worker */ + del_timer_sync(&priv->listen_timer); + flush_delayed_work(&priv->poll_work); destroy_workqueue(priv->wq); skb_queue_purge(&priv->resp_q); - del_timer(&priv->listen_timer); - list_for_each_entry_safe(cmd, n, &priv->cmd_queue, queue) { list_del(&cmd->queue); kfree(cmd); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e1846d04817f..1a984045e49c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = { .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_dev_uring_cmd, }; static ssize_t nvme_sysfs_reset(struct device *dev, @@ -3699,6 +3700,7 @@ static const struct file_operations nvme_ns_chr_fops = { .release = nvme_ns_chr_release, .unlocked_ioctl = nvme_ns_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_chr_uring_cmd, }; static int nvme_add_ns_cdev(struct nvme_ns *ns) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 554566371ffa..096b1b47d750 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,6 +5,7 @@ */ #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> +#include <linux/io_uring.h> #include "nvme.h" /* @@ -53,10 +54,21 @@ out: return ERR_PTR(ret); } -static int nvme_submit_user_cmd(struct request_queue *q, +static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, + void *meta, unsigned len, int ret) +{ + if (!ret && req_op(req) == REQ_OP_DRV_IN && + copy_to_user(ubuf, meta, len)) + ret = -EFAULT; + kfree(meta); + return ret; +} + +static struct request *nvme_alloc_user_request(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout, bool vec) + u32 meta_seed, void **metap, unsigned timeout, bool vec, + unsigned int rq_flags, blk_mq_req_flags_t blk_flags) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -66,9 +78,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0); + req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); if (IS_ERR(req)) - return PTR_ERR(req); + return req; nvme_init_request(req, cmd); if (timeout) @@ -105,26 +117,50 @@ static int nvme_submit_user_cmd(struct request_queue *q, goto out_unmap; } req->cmd_flags |= REQ_INTEGRITY; + *metap = meta; } } + return req; + +out_unmap: + if (bio) + blk_rq_unmap_user(bio); +out: + blk_mq_free_request(req); + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout, bool vec) +{ + struct request *req; + void *meta = NULL; + struct bio *bio; + int ret; + + req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, + meta_len, meta_seed, &meta, timeout, vec, 0, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + bio = req->bio; + ret = nvme_execute_passthru_rq(req); + if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: + if (meta) + ret = nvme_finish_user_metadata(req, meta_buffer, meta, + meta_len, ret); if (bio) blk_rq_unmap_user(bio); - out: blk_mq_free_request(req); return ret; } - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -296,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } +struct nvme_uring_data { + __u64 metadata; + __u64 addr; + __u32 data_len; + __u32 metadata_len; + __u32 timeout_ms; +}; + +/* + * This overlays struct io_uring_cmd pdu. + * Expect build errors if this grows larger than that. + */ +struct nvme_uring_cmd_pdu { + union { + struct bio *bio; + struct request *req; + }; + void *meta; /* kernel-resident buffer */ + void __user *meta_buffer; + u32 meta_len; +}; + +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; +} + +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + struct request *req = pdu->req; + struct bio *bio = req->bio; + int status; + u64 result; + + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + status = -EINTR; + else + status = nvme_req(req)->status; + + result = le64_to_cpu(nvme_req(req)->result.u64); + + if (pdu->meta) + status = nvme_finish_user_metadata(req, pdu->meta_buffer, + pdu->meta, pdu->meta_len, status); + if (bio) + blk_rq_unmap_user(bio); + blk_mq_free_request(req); + + io_uring_cmd_done(ioucmd, status, result); +} + +static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + /* extract bio before reusing the same field for request */ + struct bio *bio = pdu->bio; + + pdu->req = req; + req->bio = bio; + /* this takes care of moving rest of completion-work to task context */ + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); +} + +static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + const struct nvme_uring_cmd *cmd = ioucmd->cmd; + struct request_queue *q = ns ? ns->queue : ctrl->admin_q; + struct nvme_uring_data d; + struct nvme_command c; + struct request *req; + unsigned int rq_flags = 0; + blk_mq_req_flags_t blk_flags = 0; + void *meta = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + c.common.opcode = READ_ONCE(cmd->opcode); + c.common.flags = READ_ONCE(cmd->flags); + if (c.common.flags) + return -EINVAL; + + c.common.command_id = 0; + c.common.nsid = cpu_to_le32(cmd->nsid); + if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) + return -EINVAL; + + c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); + c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); + c.common.metadata = 0; + c.common.dptr.prp1 = c.common.dptr.prp2 = 0; + c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); + c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); + c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); + c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); + c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); + c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + + d.metadata = READ_ONCE(cmd->metadata); + d.addr = READ_ONCE(cmd->addr); + d.data_len = READ_ONCE(cmd->data_len); + d.metadata_len = READ_ONCE(cmd->metadata_len); + d.timeout_ms = READ_ONCE(cmd->timeout_ms); + + if (issue_flags & IO_URING_F_NONBLOCK) { + rq_flags = REQ_NOWAIT; + blk_flags = BLK_MQ_REQ_NOWAIT; + } + + req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), + d.data_len, nvme_to_user_ptr(d.metadata), + d.metadata_len, 0, &meta, d.timeout_ms ? + msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags, + blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + req->end_io_data = ioucmd; + + /* to free bio on completion, as req->bio will be null at that time */ + pdu->bio = req->bio; + pdu->meta = meta; + pdu->meta_buffer = nvme_to_user_ptr(d.metadata); + pdu->meta_len = d.metadata_len; + + blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io); + return -EIOCBQUEUED; +} + static bool is_ctrl_ioctl(unsigned int cmd) { if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) @@ -387,6 +556,53 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return __nvme_ioctl(ns, cmd, (void __user *)arg); } +static int nvme_uring_cmd_checks(unsigned int issue_flags) +{ + /* IOPOLL not supported yet */ + if (issue_flags & IO_URING_F_IOPOLL) + return -EOPNOTSUPP; + + /* NVMe passthrough requires big SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + return 0; +} + +static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_IO: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_IO_VEC: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, + struct nvme_ns, cdev); + + return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); +} + #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, struct nvme_ns_head *head, int srcu_idx) @@ -453,8 +669,46 @@ out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; } + +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EINVAL; + + if (ns) + ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ioucmd->file->private_data; + int ret; + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_ADMIN: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_ADMIN_VEC: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d464fdf978fb..d3e2440d8abb 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .release = nvme_ns_head_chr_release, .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_head_chr_uring_cmd, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2b53ca63335..26d35c557588 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -782,7 +782,12 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 6ab90891801d..816028c0f6ed 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1550,6 +1550,11 @@ static const struct qcom_pcie_cfg sc7280_cfg = { .pipe_clk_need_muxing = true, }; +static const struct qcom_pcie_cfg sc8180x_cfg = { + .ops = &ops_1_9_0, + .has_tbu_clk = true, +}; + static const struct dw_pcie_ops dw_pcie_ops = { .link_up = qcom_pcie_link_up, .start_link = qcom_pcie_start_link, @@ -1656,7 +1661,7 @@ static const struct of_device_id qcom_pcie_match[] = { { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg }, { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, - { .compatible = "qcom,pcie-sc8180x", .data = &sm8250_cfg }, + { .compatible = "qcom,pcie-sc8180x", .data = &sc8180x_cfg }, { .compatible = "qcom,pcie-sm8450-pcie0", .data = &sm8450_pcie0_cfg }, { .compatible = "qcom,pcie-sm8450-pcie1", .data = &sm8450_pcie1_cfg }, { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 09d9bf465d72..ffec82c8a523 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -272,7 +272,6 @@ struct advk_pcie { u32 actions; } wins[OB_WIN_COUNT]; u8 wins_count; - int irq; struct irq_domain *rp_irq_domain; struct irq_domain *irq_domain; struct irq_chip irq_chip; @@ -1570,26 +1569,21 @@ static void advk_pcie_handle_int(struct advk_pcie *pcie) } } -static void advk_pcie_irq_handler(struct irq_desc *desc) +static irqreturn_t advk_pcie_irq_handler(int irq, void *arg) { - struct advk_pcie *pcie = irq_desc_get_handler_data(desc); - struct irq_chip *chip = irq_desc_get_chip(desc); - u32 val, mask, status; + struct advk_pcie *pcie = arg; + u32 status; - chained_irq_enter(chip, desc); + status = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG); + if (!(status & PCIE_IRQ_CORE_INT)) + return IRQ_NONE; - val = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG); - mask = advk_readl(pcie, HOST_CTRL_INT_MASK_REG); - status = val & ((~mask) & PCIE_IRQ_ALL_MASK); + advk_pcie_handle_int(pcie); - if (status & PCIE_IRQ_CORE_INT) { - advk_pcie_handle_int(pcie); + /* Clear interrupt */ + advk_writel(pcie, PCIE_IRQ_CORE_INT, HOST_CTRL_INT_STATUS_REG); - /* Clear interrupt */ - advk_writel(pcie, PCIE_IRQ_CORE_INT, HOST_CTRL_INT_STATUS_REG); - } - - chained_irq_exit(chip, desc); + return IRQ_HANDLED; } static int advk_pcie_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) @@ -1669,7 +1663,7 @@ static int advk_pcie_probe(struct platform_device *pdev) struct advk_pcie *pcie; struct pci_host_bridge *bridge; struct resource_entry *entry; - int ret; + int ret, irq; bridge = devm_pci_alloc_host_bridge(dev, sizeof(struct advk_pcie)); if (!bridge) @@ -1755,9 +1749,17 @@ static int advk_pcie_probe(struct platform_device *pdev) if (IS_ERR(pcie->base)) return PTR_ERR(pcie->base); - pcie->irq = platform_get_irq(pdev, 0); - if (pcie->irq < 0) - return pcie->irq; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, advk_pcie_irq_handler, + IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie", + pcie); + if (ret) { + dev_err(dev, "Failed to register interrupt\n"); + return ret; + } pcie->reset_gpio = devm_gpiod_get_from_of_node(dev, dev->of_node, "reset-gpios", 0, @@ -1814,15 +1816,12 @@ static int advk_pcie_probe(struct platform_device *pdev) return ret; } - irq_set_chained_handler_and_data(pcie->irq, advk_pcie_irq_handler, pcie); - bridge->sysdata = pcie; bridge->ops = &advk_pcie_ops; bridge->map_irq = advk_pcie_map_irq; ret = pci_host_probe(bridge); if (ret < 0) { - irq_set_chained_handler_and_data(pcie->irq, NULL, NULL); advk_pcie_remove_rp_irq_domain(pcie); advk_pcie_remove_msi_irq_domain(pcie); advk_pcie_remove_irq_domain(pcie); @@ -1871,9 +1870,6 @@ static int advk_pcie_remove(struct platform_device *pdev) advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_REG); advk_writel(pcie, PCIE_IRQ_ALL_MASK, HOST_CTRL_INT_STATUS_REG); - /* Remove IRQ handler */ - irq_set_chained_handler_and_data(pcie->irq, NULL, NULL); - /* Remove IRQ domains */ advk_pcie_remove_rp_irq_domain(pcie); advk_pcie_remove_msi_irq_domain(pcie); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 9ecce435fb3f..d25122fbe98a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2920,6 +2920,16 @@ static const struct dmi_system_id bridge_d3_blacklist[] = { DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co., Ltd."), DMI_MATCH(DMI_BOARD_NAME, "X299 DESIGNARE EX-CF"), }, + /* + * Downstream device is not accessible after putting a root port + * into D3cold and back into D0 on Elo i2. + */ + .ident = "Elo i2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Elo Touch Solutions"), + DMI_MATCH(DMI_PRODUCT_NAME, "Elo i2"), + DMI_MATCH(DMI_PRODUCT_VERSION, "RevB"), + }, }, #endif { } diff --git a/drivers/pinctrl/aspeed/pinctrl-aspeed-g6.c b/drivers/pinctrl/aspeed/pinctrl-aspeed-g6.c index a3fa03bcd9a3..80838dc54b3a 100644 --- a/drivers/pinctrl/aspeed/pinctrl-aspeed-g6.c +++ b/drivers/pinctrl/aspeed/pinctrl-aspeed-g6.c @@ -1236,18 +1236,17 @@ FUNC_GROUP_DECL(SALT8, AA12); FUNC_GROUP_DECL(WDTRST4, AA12); #define AE12 196 -SIG_EXPR_LIST_DECL_SEMG(AE12, FWSPIDQ2, FWQSPID, FWSPID, - SIG_DESC_SET(SCU438, 4)); +SIG_EXPR_LIST_DECL_SESG(AE12, FWSPIQ2, FWQSPI, SIG_DESC_SET(SCU438, 4)); SIG_EXPR_LIST_DECL_SESG(AE12, GPIOY4, GPIOY4); -PIN_DECL_(AE12, SIG_EXPR_LIST_PTR(AE12, FWSPIDQ2), +PIN_DECL_(AE12, SIG_EXPR_LIST_PTR(AE12, FWSPIQ2), SIG_EXPR_LIST_PTR(AE12, GPIOY4)); #define AF12 197 -SIG_EXPR_LIST_DECL_SEMG(AF12, FWSPIDQ3, FWQSPID, FWSPID, - SIG_DESC_SET(SCU438, 5)); +SIG_EXPR_LIST_DECL_SESG(AF12, FWSPIQ3, FWQSPI, SIG_DESC_SET(SCU438, 5)); SIG_EXPR_LIST_DECL_SESG(AF12, GPIOY5, GPIOY5); -PIN_DECL_(AF12, SIG_EXPR_LIST_PTR(AF12, FWSPIDQ3), +PIN_DECL_(AF12, SIG_EXPR_LIST_PTR(AF12, FWSPIQ3), SIG_EXPR_LIST_PTR(AF12, GPIOY5)); +FUNC_GROUP_DECL(FWQSPI, AE12, AF12); #define AC12 198 SSSF_PIN_DECL(AC12, GPIOY6, FWSPIABR, SIG_DESC_SET(SCU438, 6)); @@ -1520,9 +1519,8 @@ SIG_EXPR_LIST_DECL_SEMG(Y4, EMMCDAT7, EMMCG8, EMMC, SIG_DESC_SET(SCU404, 3)); PIN_DECL_3(Y4, GPIO18E3, FWSPIDMISO, VBMISO, EMMCDAT7); GROUP_DECL(FWSPID, Y1, Y2, Y3, Y4); -GROUP_DECL(FWQSPID, Y1, Y2, Y3, Y4, AE12, AF12); GROUP_DECL(EMMCG8, AB4, AA4, AC4, AA5, Y5, AB5, AB6, AC5, Y1, Y2, Y3, Y4); -FUNC_DECL_2(FWSPID, FWSPID, FWQSPID); +FUNC_DECL_1(FWSPID, FWSPID); FUNC_GROUP_DECL(VB, Y1, Y2, Y3, Y4); FUNC_DECL_3(EMMC, EMMCG1, EMMCG4, EMMCG8); /* @@ -1918,7 +1916,7 @@ static const struct aspeed_pin_group aspeed_g6_groups[] = { ASPEED_PINCTRL_GROUP(FSI2), ASPEED_PINCTRL_GROUP(FWSPIABR), ASPEED_PINCTRL_GROUP(FWSPID), - ASPEED_PINCTRL_GROUP(FWQSPID), + ASPEED_PINCTRL_GROUP(FWQSPI), ASPEED_PINCTRL_GROUP(FWSPIWP), ASPEED_PINCTRL_GROUP(GPIT0), ASPEED_PINCTRL_GROUP(GPIT1), @@ -2160,6 +2158,7 @@ static const struct aspeed_pin_function aspeed_g6_functions[] = { ASPEED_PINCTRL_FUNC(FSI2), ASPEED_PINCTRL_FUNC(FWSPIABR), ASPEED_PINCTRL_FUNC(FWSPID), + ASPEED_PINCTRL_FUNC(FWQSPI), ASPEED_PINCTRL_FUNC(FWSPIWP), ASPEED_PINCTRL_FUNC(GPIT0), ASPEED_PINCTRL_FUNC(GPIT1), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8365.c b/drivers/pinctrl/mediatek/pinctrl-mt8365.c index 727c65221aef..57f37a294063 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8365.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8365.c @@ -259,7 +259,7 @@ static const struct mtk_pin_ies_smt_set mt8365_ies_set[] = { MTK_PIN_IES_SMT_SPEC(104, 104, 0x420, 13), MTK_PIN_IES_SMT_SPEC(105, 109, 0x420, 14), MTK_PIN_IES_SMT_SPEC(110, 113, 0x420, 15), - MTK_PIN_IES_SMT_SPEC(114, 112, 0x420, 16), + MTK_PIN_IES_SMT_SPEC(114, 116, 0x420, 16), MTK_PIN_IES_SMT_SPEC(117, 119, 0x420, 17), MTK_PIN_IES_SMT_SPEC(120, 122, 0x420, 18), MTK_PIN_IES_SMT_SPEC(123, 125, 0x420, 19), diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index 003fb0e34153..6a956ee94494 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -129,6 +129,7 @@ enum { FUNC_PTP1, FUNC_PTP2, FUNC_PTP3, + FUNC_PTPSYNC_0, FUNC_PTPSYNC_1, FUNC_PTPSYNC_2, FUNC_PTPSYNC_3, @@ -252,6 +253,7 @@ static const char *const ocelot_function_names[] = { [FUNC_PTP1] = "ptp1", [FUNC_PTP2] = "ptp2", [FUNC_PTP3] = "ptp3", + [FUNC_PTPSYNC_0] = "ptpsync_0", [FUNC_PTPSYNC_1] = "ptpsync_1", [FUNC_PTPSYNC_2] = "ptpsync_2", [FUNC_PTPSYNC_3] = "ptpsync_3", @@ -983,7 +985,7 @@ LAN966X_P(31, GPIO, FC3_c, CAN1, NONE, OB_TRG, RECO_b, NON LAN966X_P(32, GPIO, FC3_c, NONE, SGPIO_a, NONE, MIIM_Sa, NONE, R); LAN966X_P(33, GPIO, FC1_b, NONE, SGPIO_a, NONE, MIIM_Sa, MIIM_b, R); LAN966X_P(34, GPIO, FC1_b, NONE, SGPIO_a, NONE, MIIM_Sa, MIIM_b, R); -LAN966X_P(35, GPIO, FC1_b, NONE, SGPIO_a, CAN0_b, NONE, NONE, R); +LAN966X_P(35, GPIO, FC1_b, PTPSYNC_0, SGPIO_a, CAN0_b, NONE, NONE, R); LAN966X_P(36, GPIO, NONE, PTPSYNC_1, NONE, CAN0_b, NONE, NONE, R); LAN966X_P(37, GPIO, FC_SHRD0, PTPSYNC_2, TWI_SLC_GATE_AD, NONE, NONE, NONE, R); LAN966X_P(38, GPIO, NONE, PTPSYNC_3, NONE, NONE, NONE, NONE, R); diff --git a/drivers/pinctrl/sunxi/pinctrl-suniv-f1c100s.c b/drivers/pinctrl/sunxi/pinctrl-suniv-f1c100s.c index 2801ca706273..b8fc88a23cf4 100644 --- a/drivers/pinctrl/sunxi/pinctrl-suniv-f1c100s.c +++ b/drivers/pinctrl/sunxi/pinctrl-suniv-f1c100s.c @@ -51,7 +51,7 @@ static const struct sunxi_desc_pin suniv_f1c100s_pins[] = { SUNXI_FUNCTION(0x3, "pwm0"), /* PWM0 */ SUNXI_FUNCTION(0x4, "i2s"), /* IN */ SUNXI_FUNCTION(0x5, "uart1"), /* RX */ - SUNXI_FUNCTION(0x6, "spi1")), /* MOSI */ + SUNXI_FUNCTION(0x6, "spi1")), /* CLK */ SUNXI_PIN(SUNXI_PINCTRL_PIN(A, 3), SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), @@ -204,7 +204,7 @@ static const struct sunxi_desc_pin suniv_f1c100s_pins[] = { SUNXI_FUNCTION(0x0, "gpio_in"), SUNXI_FUNCTION(0x1, "gpio_out"), SUNXI_FUNCTION(0x2, "lcd"), /* D20 */ - SUNXI_FUNCTION(0x3, "lvds1"), /* RX */ + SUNXI_FUNCTION(0x3, "uart2"), /* RX */ SUNXI_FUNCTION_IRQ_BANK(0x6, 0, 14)), SUNXI_PIN(SUNXI_PINCTRL_PIN(D, 15), SUNXI_FUNCTION(0x0, "gpio_in"), diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index dd45471f6780..860672d6a03c 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -300,7 +300,7 @@ struct ptp_ocp { struct platform_device *spi_flash; struct clk_hw *i2c_clk; struct timer_list watchdog; - const struct ocp_attr_group *attr_tbl; + const struct attribute_group **attr_group; const struct ptp_ocp_eeprom_map *eeprom_map; struct dentry *debug_root; time64_t gnss_lost; @@ -841,7 +841,7 @@ __ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u32 adj_val) } static void -ptp_ocp_adjtime_coarse(struct ptp_ocp *bp, u64 delta_ns) +ptp_ocp_adjtime_coarse(struct ptp_ocp *bp, s64 delta_ns) { struct timespec64 ts; unsigned long flags; @@ -850,7 +850,8 @@ ptp_ocp_adjtime_coarse(struct ptp_ocp *bp, u64 delta_ns) spin_lock_irqsave(&bp->lock, flags); err = __ptp_ocp_gettime_locked(bp, &ts, NULL); if (likely(!err)) { - timespec64_add_ns(&ts, delta_ns); + set_normalized_timespec64(&ts, ts.tv_sec, + ts.tv_nsec + delta_ns); __ptp_ocp_settime_locked(bp, &ts); } spin_unlock_irqrestore(&bp->lock, flags); @@ -1836,6 +1837,42 @@ ptp_ocp_signal_init(struct ptp_ocp *bp) } static void +ptp_ocp_attr_group_del(struct ptp_ocp *bp) +{ + sysfs_remove_groups(&bp->dev.kobj, bp->attr_group); + kfree(bp->attr_group); +} + +static int +ptp_ocp_attr_group_add(struct ptp_ocp *bp, + const struct ocp_attr_group *attr_tbl) +{ + int count, i; + int err; + + count = 0; + for (i = 0; attr_tbl[i].cap; i++) + if (attr_tbl[i].cap & bp->fw_cap) + count++; + + bp->attr_group = kcalloc(count + 1, sizeof(struct attribute_group *), + GFP_KERNEL); + if (!bp->attr_group) + return -ENOMEM; + + count = 0; + for (i = 0; attr_tbl[i].cap; i++) + if (attr_tbl[i].cap & bp->fw_cap) + bp->attr_group[count++] = attr_tbl[i].group; + + err = sysfs_create_groups(&bp->dev.kobj, bp->attr_group); + if (err) + bp->attr_group[0] = NULL; + + return err; +} + +static void ptp_ocp_sma_init(struct ptp_ocp *bp) { u32 reg; @@ -1904,7 +1941,6 @@ ptp_ocp_fb_board_init(struct ptp_ocp *bp, struct ocp_resource *r) bp->flash_start = 1024 * 4096; bp->eeprom_map = fb_eeprom_map; bp->fw_version = ioread32(&bp->image->version); - bp->attr_tbl = fb_timecard_groups; bp->fw_cap = OCP_CAP_BASIC; ver = bp->fw_version & 0xffff; @@ -1918,6 +1954,10 @@ ptp_ocp_fb_board_init(struct ptp_ocp *bp, struct ocp_resource *r) ptp_ocp_sma_init(bp); ptp_ocp_signal_init(bp); + err = ptp_ocp_attr_group_add(bp, fb_timecard_groups); + if (err) + return err; + err = ptp_ocp_fb_set_pins(bp); if (err) return err; @@ -3388,7 +3428,6 @@ ptp_ocp_complete(struct ptp_ocp *bp) { struct pps_device *pps; char buf[32]; - int i, err; if (bp->gnss_port != -1) { sprintf(buf, "ttyS%d", bp->gnss_port); @@ -3413,14 +3452,6 @@ ptp_ocp_complete(struct ptp_ocp *bp) if (pps) ptp_ocp_symlink(bp, pps->dev, "pps"); - for (i = 0; bp->attr_tbl[i].cap; i++) { - if (!(bp->attr_tbl[i].cap & bp->fw_cap)) - continue; - err = sysfs_create_group(&bp->dev.kobj, bp->attr_tbl[i].group); - if (err) - return err; - } - ptp_ocp_debugfs_add_device(bp); return 0; @@ -3492,15 +3523,11 @@ static void ptp_ocp_detach_sysfs(struct ptp_ocp *bp) { struct device *dev = &bp->dev; - int i; sysfs_remove_link(&dev->kobj, "ttyGNSS"); sysfs_remove_link(&dev->kobj, "ttyMAC"); sysfs_remove_link(&dev->kobj, "ptp"); sysfs_remove_link(&dev->kobj, "pps"); - if (bp->attr_tbl) - for (i = 0; bp->attr_tbl[i].cap; i++) - sysfs_remove_group(&dev->kobj, bp->attr_tbl[i].group); } static void @@ -3510,6 +3537,7 @@ ptp_ocp_detach(struct ptp_ocp *bp) ptp_ocp_debugfs_remove_device(bp); ptp_ocp_detach_sysfs(bp); + ptp_ocp_attr_group_del(bp); if (timer_pending(&bp->watchdog)) del_timer_sync(&bp->watchdog); if (bp->ts0) diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index 5b3e4da63406..5252ce4cbda4 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -370,6 +370,23 @@ CLK_OF_DECLARE_DRIVER(sun8i_h3_rtc_clk, "allwinner,sun8i-h3-rtc", CLK_OF_DECLARE_DRIVER(sun50i_h5_rtc_clk, "allwinner,sun50i-h5-rtc", sun8i_h3_rtc_clk_init); +static const struct sun6i_rtc_clk_data sun50i_h6_rtc_data = { + .rc_osc_rate = 16000000, + .fixed_prescaler = 32, + .has_prescaler = 1, + .has_out_clk = 1, + .export_iosc = 1, + .has_losc_en = 1, + .has_auto_swt = 1, +}; + +static void __init sun50i_h6_rtc_clk_init(struct device_node *node) +{ + sun6i_rtc_clk_init(node, &sun50i_h6_rtc_data); +} +CLK_OF_DECLARE_DRIVER(sun50i_h6_rtc_clk, "allwinner,sun50i-h6-rtc", + sun50i_h6_rtc_clk_init); + /* * The R40 user manual is self-conflicting on whether the prescaler is * fixed or configurable. The clock diagram shows it as fixed, but there diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 81099b68bbfb..588c0329b80c 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -1254,6 +1254,13 @@ void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) struct utp_hpb_rsp *rsp_field = &lrbp->ucd_rsp_ptr->hr; int data_seg_len; + data_seg_len = be32_to_cpu(lrbp->ucd_rsp_ptr->header.dword_2) + & MASK_RSP_UPIU_DATA_SEG_LEN; + + /* If data segment length is zero, rsp_field is not valid */ + if (!data_seg_len) + return; + if (unlikely(lrbp->lun != rsp_field->lun)) { struct scsi_device *sdev; bool found = false; @@ -1288,18 +1295,6 @@ void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) return; } - data_seg_len = be32_to_cpu(lrbp->ucd_rsp_ptr->header.dword_2) - & MASK_RSP_UPIU_DATA_SEG_LEN; - - /* To flush remained rsp_list, we queue the map_work task */ - if (!data_seg_len) { - if (!ufshpb_is_general_lun(hpb->lun)) - return; - - ufshpb_kick_map_work(hpb); - return; - } - BUILD_BUG_ON(sizeof(struct utp_hpb_rsp) != UTP_HPB_RSP_SIZE); if (!ufshpb_is_hpb_rsp_valid(hba, lrbp, rsp_field)) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 6fe6a6bab3f4..ddf6c2a7212b 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3596,10 +3596,7 @@ static int iscsit_send_reject( void iscsit_thread_get_cpumask(struct iscsi_conn *conn) { int ord, cpu; - cpumask_t conn_allowed_cpumask; - - cpumask_and(&conn_allowed_cpumask, iscsit_global->allowed_cpumask, - cpu_online_mask); + cpumask_var_t conn_allowed_cpumask; /* * bitmap_id is assigned from iscsit_global->ts_bitmap from @@ -3609,13 +3606,28 @@ void iscsit_thread_get_cpumask(struct iscsi_conn *conn) * iSCSI connection's RX/TX threads will be scheduled to * execute upon. */ - cpumask_clear(conn->conn_cpumask); - ord = conn->bitmap_id % cpumask_weight(&conn_allowed_cpumask); - for_each_cpu(cpu, &conn_allowed_cpumask) { - if (ord-- == 0) { - cpumask_set_cpu(cpu, conn->conn_cpumask); - return; + if (!zalloc_cpumask_var(&conn_allowed_cpumask, GFP_KERNEL)) { + ord = conn->bitmap_id % cpumask_weight(cpu_online_mask); + for_each_online_cpu(cpu) { + if (ord-- == 0) { + cpumask_set_cpu(cpu, conn->conn_cpumask); + return; + } + } + } else { + cpumask_and(conn_allowed_cpumask, iscsit_global->allowed_cpumask, + cpu_online_mask); + + cpumask_clear(conn->conn_cpumask); + ord = conn->bitmap_id % cpumask_weight(conn_allowed_cpumask); + for_each_cpu(cpu, conn_allowed_cpumask) { + if (ord-- == 0) { + cpumask_set_cpu(cpu, conn->conn_cpumask); + free_cpumask_var(conn_allowed_cpumask); + return; + } } + free_cpumask_var(conn_allowed_cpumask); } /* * This should never be reached.. diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index 0cedcfe207b5..57b4fd56d92a 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -1137,23 +1137,27 @@ static ssize_t lio_target_wwn_cpus_allowed_list_show( static ssize_t lio_target_wwn_cpus_allowed_list_store( struct config_item *item, const char *page, size_t count) { - int ret; + int ret = -ENOMEM; char *orig; - cpumask_t new_allowed_cpumask; + cpumask_var_t new_allowed_cpumask; + + if (!zalloc_cpumask_var(&new_allowed_cpumask, GFP_KERNEL)) + goto out; orig = kstrdup(page, GFP_KERNEL); if (!orig) - return -ENOMEM; + goto out_free_cpumask; - cpumask_clear(&new_allowed_cpumask); - ret = cpulist_parse(orig, &new_allowed_cpumask); + ret = cpulist_parse(orig, new_allowed_cpumask); + if (!ret) + cpumask_copy(iscsit_global->allowed_cpumask, + new_allowed_cpumask); kfree(orig); - if (ret != 0) - return ret; - - cpumask_copy(iscsit_global->allowed_cpumask, &new_allowed_cpumask); - return count; +out_free_cpumask: + free_cpumask_var(new_allowed_cpumask); +out: + return ret ? ret : count; } CONFIGFS_ATTR(lio_target_wwn_, cpus_allowed_list); diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index d97f496bab9b..79931ddc582a 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -194,12 +194,31 @@ static int int3400_thermal_run_osc(acpi_handle handle, char *uuid_str, int *enab return result; } +static int set_os_uuid_mask(struct int3400_thermal_priv *priv, u32 mask) +{ + int cap = 0; + + /* + * Capability bits: + * Bit 0: set to 1 to indicate DPTF is active + * Bi1 1: set to 1 to active cooling is supported by user space daemon + * Bit 2: set to 1 to passive cooling is supported by user space daemon + * Bit 3: set to 1 to critical trip is handled by user space daemon + */ + if (mask) + cap = (priv->os_uuid_mask << 1) | 0x01; + + return int3400_thermal_run_osc(priv->adev->handle, + "b23ba85d-c8b7-3542-88de-8de2ffcfd698", + &cap); +} + static ssize_t current_uuid_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct int3400_thermal_priv *priv = dev_get_drvdata(dev); - int i; + int ret, i; for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; ++i) { if (!strncmp(buf, int3400_thermal_uuids[i], @@ -231,19 +250,7 @@ static ssize_t current_uuid_store(struct device *dev, } if (priv->os_uuid_mask) { - int cap, ret; - - /* - * Capability bits: - * Bit 0: set to 1 to indicate DPTF is active - * Bi1 1: set to 1 to active cooling is supported by user space daemon - * Bit 2: set to 1 to passive cooling is supported by user space daemon - * Bit 3: set to 1 to critical trip is handled by user space daemon - */ - cap = ((priv->os_uuid_mask << 1) | 0x01); - ret = int3400_thermal_run_osc(priv->adev->handle, - "b23ba85d-c8b7-3542-88de-8de2ffcfd698", - &cap); + ret = set_os_uuid_mask(priv, priv->os_uuid_mask); if (ret) return ret; } @@ -469,17 +476,26 @@ static int int3400_thermal_change_mode(struct thermal_zone_device *thermal, if (mode != thermal->mode) { int enabled; + enabled = mode == THERMAL_DEVICE_ENABLED; + + if (priv->os_uuid_mask) { + if (!enabled) { + priv->os_uuid_mask = 0; + result = set_os_uuid_mask(priv, priv->os_uuid_mask); + } + goto eval_odvp; + } + if (priv->current_uuid_index < 0 || priv->current_uuid_index >= INT3400_THERMAL_MAXIMUM_UUID) return -EINVAL; - enabled = (mode == THERMAL_DEVICE_ENABLED); result = int3400_thermal_run_osc(priv->adev->handle, int3400_thermal_uuids[priv->current_uuid_index], &enabled); } - +eval_odvp: evaluate_odvp(priv); return result; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 79001301b383..e0de44000d92 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -161,6 +161,7 @@ struct mlx5_vdpa_net { struct mlx5_flow_handle *rx_rule_mcast; bool setup; u32 cur_num_vqs; + u32 rqt_size; struct notifier_block nb; struct vdpa_callback config_cb; struct mlx5_vdpa_wq_ent cvq_ent; @@ -204,17 +205,12 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); } -static inline u32 mlx5_vdpa_max_qps(int max_vqs) -{ - return max_vqs / 2; -} - static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev) { if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) return 2; - return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); + return mvdev->max_vqs; } static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx) @@ -1236,25 +1232,13 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue * static int create_rqt(struct mlx5_vdpa_net *ndev) { __be32 *list; - int max_rqt; void *rqtc; int inlen; void *in; int i, j; int err; - int num; - - if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) - num = 1; - else - num = ndev->cur_num_vqs / 2; - max_rqt = min_t(int, roundup_pow_of_two(num), - 1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); - if (max_rqt < 1) - return -EOPNOTSUPP; - - inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num); + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num); in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; @@ -1263,12 +1247,12 @@ static int create_rqt(struct mlx5_vdpa_net *ndev) rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); - MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt); + MLX5_SET(rqtc, rqtc, rqt_max_size, ndev->rqt_size); list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); - for (i = 0, j = 0; i < max_rqt; i++, j += 2) - list[i] = cpu_to_be32(ndev->vqs[j % (2 * num)].virtq_id); + for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2) + list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id); - MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt); + MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size); err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn); kfree(in); if (err) @@ -1282,19 +1266,13 @@ static int create_rqt(struct mlx5_vdpa_net *ndev) static int modify_rqt(struct mlx5_vdpa_net *ndev, int num) { __be32 *list; - int max_rqt; void *rqtc; int inlen; void *in; int i, j; int err; - max_rqt = min_t(int, roundup_pow_of_two(ndev->cur_num_vqs / 2), - 1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); - if (max_rqt < 1) - return -EOPNOTSUPP; - - inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num); + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num); in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; @@ -1305,10 +1283,10 @@ static int modify_rqt(struct mlx5_vdpa_net *ndev, int num) MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); - for (i = 0, j = 0; i < max_rqt; i++, j += 2) + for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2) list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id); - MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt); + MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size); err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn); kfree(in); if (err) @@ -1625,7 +1603,7 @@ static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd) newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs); if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || - newqps > mlx5_vdpa_max_qps(mvdev->max_vqs)) + newqps > ndev->rqt_size) break; if (ndev->cur_num_vqs == 2 * newqps) { @@ -1989,7 +1967,7 @@ static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev) int err; int i; - for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) { + for (i = 0; i < mvdev->max_vqs; i++) { err = setup_vq(ndev, &ndev->vqs[i]); if (err) goto err_vq; @@ -2060,9 +2038,11 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)) - ndev->cur_num_vqs = 2 * mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs); + ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs); else - ndev->cur_num_vqs = 2; + ndev->rqt_size = 1; + + ndev->cur_num_vqs = 2 * ndev->rqt_size; update_cvq_info(mvdev); return err; @@ -2529,7 +2509,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev) struct mlx5_vdpa_virtqueue *mvq; int i; - for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) { + for (i = 0; i < ndev->mvdev.max_vqs; ++i) { mvq = &ndev->vqs[i]; memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri)); mvq->index = i; @@ -2671,7 +2651,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, return -EOPNOTSUPP; } - max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); + max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues), + 1 << MLX5_CAP_GEN(mdev, log_max_rqt_size)); if (max_vqs < 2) { dev_warn(mdev->device, "%d virtqueues are supported. At least 2 are required\n", @@ -2742,7 +2723,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); } - config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs)); + config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2); mvdev->vdev.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) @@ -2769,7 +2750,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, ndev->nb.notifier_call = event_handler; mlx5_notifier_register(mdev, &ndev->nb); mvdev->vdev.mdev = &mgtdev->mgtdev; - err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs) + 1); + err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1); if (err) goto err_reg; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 792ab5f23647..297b5db47454 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1450,13 +1450,9 @@ err: return ERR_PTR(r); } -static struct ptr_ring *get_tap_ptr_ring(int fd) +static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; - struct file *file = fget(fd); - - if (!file) - return NULL; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; @@ -1465,7 +1461,6 @@ static struct ptr_ring *get_tap_ptr_ring(int fd) goto out; ring = NULL; out: - fput(file); return ring; } @@ -1552,8 +1547,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) r = vhost_net_enable_vq(n, vq); if (r) goto err_used; - if (index == VHOST_NET_VQ_RX) - nvq->rx_ring = get_tap_ptr_ring(fd); + if (index == VHOST_NET_VQ_RX) { + if (sock) + nvq->rx_ring = get_tap_ptr_ring(sock->file); + else + nvq->rx_ring = NULL; + } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 121b9293c737..c877da072d4d 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -47,4 +47,7 @@ source "drivers/virt/vboxguest/Kconfig" source "drivers/virt/nitro_enclaves/Kconfig" source "drivers/virt/acrn/Kconfig" + +source "drivers/virt/coco/efi_secret/Kconfig" + endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index 108d0ffcc9aa..067b5427f40f 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -9,3 +9,4 @@ obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ obj-$(CONFIG_ACRN_HSM) += acrn/ +obj-$(CONFIG_EFI_SECRET) += coco/efi_secret/ diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig new file mode 100644 index 000000000000..4404d198f3b2 --- /dev/null +++ b/drivers/virt/coco/efi_secret/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +config EFI_SECRET + tristate "EFI secret area securityfs support" + depends on EFI && X86_64 + select EFI_COCO_SECRET + select SECURITYFS + help + This is a driver for accessing the EFI secret area via securityfs. + The EFI secret area is a memory area designated by the firmware for + confidential computing secret injection (for example for AMD SEV + guests). The driver exposes the secrets as files in + <securityfs>/secrets/coco. Files can be read and deleted (deleting + a file wipes the secret from memory). + + To compile this driver as a module, choose M here. + The module will be called efi_secret. diff --git a/drivers/virt/coco/efi_secret/Makefile b/drivers/virt/coco/efi_secret/Makefile new file mode 100644 index 000000000000..c7047ce804f7 --- /dev/null +++ b/drivers/virt/coco/efi_secret/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_EFI_SECRET) += efi_secret.o diff --git a/drivers/virt/coco/efi_secret/efi_secret.c b/drivers/virt/coco/efi_secret/efi_secret.c new file mode 100644 index 000000000000..e700a5ef7043 --- /dev/null +++ b/drivers/virt/coco/efi_secret/efi_secret.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * efi_secret module + * + * Copyright (C) 2022 IBM Corporation + * Author: Dov Murik <dovmurik@linux.ibm.com> + */ + +/** + * DOC: efi_secret: Allow reading EFI confidential computing (coco) secret area + * via securityfs interface. + * + * When the module is loaded (and securityfs is mounted, typically under + * /sys/kernel/security), a "secrets/coco" directory is created in securityfs. + * In it, a file is created for each secret entry. The name of each such file + * is the GUID of the secret entry, and its content is the secret data. + */ + +#include <linux/platform_device.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/security.h> +#include <linux/efi.h> +#include <linux/cacheflush.h> + +#define EFI_SECRET_NUM_FILES 64 + +struct efi_secret { + struct dentry *secrets_dir; + struct dentry *fs_dir; + struct dentry *fs_files[EFI_SECRET_NUM_FILES]; + void __iomem *secret_data; + u64 secret_data_len; +}; + +/* + * Structure of the EFI secret area + * + * Offset Length + * (bytes) (bytes) Usage + * ------- ------- ----- + * 0 16 Secret table header GUID (must be 1e74f542-71dd-4d66-963e-ef4287ff173b) + * 16 4 Length of bytes of the entire secret area + * + * 20 16 First secret entry's GUID + * 36 4 First secret entry's length in bytes (= 16 + 4 + x) + * 40 x First secret entry's data + * + * 40+x 16 Second secret entry's GUID + * 56+x 4 Second secret entry's length in bytes (= 16 + 4 + y) + * 60+x y Second secret entry's data + * + * (... and so on for additional entries) + * + * The GUID of each secret entry designates the usage of the secret data. + */ + +/** + * struct secret_header - Header of entire secret area; this should be followed + * by instances of struct secret_entry. + * @guid: Must be EFI_SECRET_TABLE_HEADER_GUID + * @len: Length in bytes of entire secret area, including header + */ +struct secret_header { + efi_guid_t guid; + u32 len; +} __attribute((packed)); + +/** + * struct secret_entry - Holds one secret entry + * @guid: Secret-specific GUID (or NULL_GUID if this secret entry was deleted) + * @len: Length of secret entry, including its guid and len fields + * @data: The secret data (full of zeros if this secret entry was deleted) + */ +struct secret_entry { + efi_guid_t guid; + u32 len; + u8 data[]; +} __attribute((packed)); + +static size_t secret_entry_data_len(struct secret_entry *e) +{ + return e->len - sizeof(*e); +} + +static struct efi_secret the_efi_secret; + +static inline struct efi_secret *efi_secret_get(void) +{ + return &the_efi_secret; +} + +static int efi_secret_bin_file_show(struct seq_file *file, void *data) +{ + struct secret_entry *e = file->private; + + if (e) + seq_write(file, e->data, secret_entry_data_len(e)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(efi_secret_bin_file); + +/* + * Overwrite memory content with zeroes, and ensure that dirty cache lines are + * actually written back to memory, to clear out the secret. + */ +static void wipe_memory(void *addr, size_t size) +{ + memzero_explicit(addr, size); +#ifdef CONFIG_X86 + clflush_cache_range(addr, size); +#endif +} + +static int efi_secret_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efi_secret *s = efi_secret_get(); + struct inode *inode = d_inode(dentry); + struct secret_entry *e = (struct secret_entry *)inode->i_private; + int i; + + if (e) { + /* Zero out the secret data */ + wipe_memory(e->data, secret_entry_data_len(e)); + e->guid = NULL_GUID; + } + + inode->i_private = NULL; + + for (i = 0; i < EFI_SECRET_NUM_FILES; i++) + if (s->fs_files[i] == dentry) + s->fs_files[i] = NULL; + + /* + * securityfs_remove tries to lock the directory's inode, but we reach + * the unlink callback when it's already locked + */ + inode_unlock(dir); + securityfs_remove(dentry); + inode_lock(dir); + + return 0; +} + +static const struct inode_operations efi_secret_dir_inode_operations = { + .lookup = simple_lookup, + .unlink = efi_secret_unlink, +}; + +static int efi_secret_map_area(struct platform_device *dev) +{ + int ret; + struct efi_secret *s = efi_secret_get(); + struct linux_efi_coco_secret_area *secret_area; + + if (efi.coco_secret == EFI_INVALID_TABLE_ADDR) { + dev_err(&dev->dev, "Secret area address is not available\n"); + return -EINVAL; + } + + secret_area = memremap(efi.coco_secret, sizeof(*secret_area), MEMREMAP_WB); + if (secret_area == NULL) { + dev_err(&dev->dev, "Could not map secret area EFI config entry\n"); + return -ENOMEM; + } + if (!secret_area->base_pa || secret_area->size < sizeof(struct secret_header)) { + dev_err(&dev->dev, + "Invalid secret area memory location (base_pa=0x%llx size=0x%llx)\n", + secret_area->base_pa, secret_area->size); + ret = -EINVAL; + goto unmap; + } + + s->secret_data = ioremap_encrypted(secret_area->base_pa, secret_area->size); + if (s->secret_data == NULL) { + dev_err(&dev->dev, "Could not map secret area\n"); + ret = -ENOMEM; + goto unmap; + } + + s->secret_data_len = secret_area->size; + ret = 0; + +unmap: + memunmap(secret_area); + return ret; +} + +static void efi_secret_securityfs_teardown(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int i; + + for (i = (EFI_SECRET_NUM_FILES - 1); i >= 0; i--) { + securityfs_remove(s->fs_files[i]); + s->fs_files[i] = NULL; + } + + securityfs_remove(s->fs_dir); + s->fs_dir = NULL; + + securityfs_remove(s->secrets_dir); + s->secrets_dir = NULL; + + dev_dbg(&dev->dev, "Removed securityfs entries\n"); +} + +static int efi_secret_securityfs_setup(struct platform_device *dev) +{ + struct efi_secret *s = efi_secret_get(); + int ret = 0, i = 0, bytes_left; + unsigned char *ptr; + struct secret_header *h; + struct secret_entry *e; + struct dentry *dent; + char guid_str[EFI_VARIABLE_GUID_LEN + 1]; + + ptr = (void __force *)s->secret_data; + h = (struct secret_header *)ptr; + if (efi_guidcmp(h->guid, EFI_SECRET_TABLE_HEADER_GUID)) { + /* + * This is not an error: it just means that EFI defines secret + * area but it was not populated by the Guest Owner. + */ + dev_dbg(&dev->dev, "EFI secret area does not start with correct GUID\n"); + return -ENODEV; + } + if (h->len < sizeof(*h)) { + dev_err(&dev->dev, "EFI secret area reported length is too small\n"); + return -EINVAL; + } + if (h->len > s->secret_data_len) { + dev_err(&dev->dev, "EFI secret area reported length is too big\n"); + return -EINVAL; + } + + s->secrets_dir = NULL; + s->fs_dir = NULL; + memset(s->fs_files, 0, sizeof(s->fs_files)); + + dent = securityfs_create_dir("secrets", NULL); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating secrets securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + s->secrets_dir = dent; + + dent = securityfs_create_dir("coco", s->secrets_dir); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating coco securityfs directory entry err=%ld\n", + PTR_ERR(dent)); + return PTR_ERR(dent); + } + d_inode(dent)->i_op = &efi_secret_dir_inode_operations; + s->fs_dir = dent; + + bytes_left = h->len - sizeof(*h); + ptr += sizeof(*h); + while (bytes_left >= (int)sizeof(*e) && i < EFI_SECRET_NUM_FILES) { + e = (struct secret_entry *)ptr; + if (e->len < sizeof(*e) || e->len > (unsigned int)bytes_left) { + dev_err(&dev->dev, "EFI secret area is corrupted\n"); + ret = -EINVAL; + goto err_cleanup; + } + + /* Skip deleted entries (which will have NULL_GUID) */ + if (efi_guidcmp(e->guid, NULL_GUID)) { + efi_guid_to_str(&e->guid, guid_str); + + dent = securityfs_create_file(guid_str, 0440, s->fs_dir, (void *)e, + &efi_secret_bin_file_fops); + if (IS_ERR(dent)) { + dev_err(&dev->dev, "Error creating efi_secret securityfs entry\n"); + ret = PTR_ERR(dent); + goto err_cleanup; + } + + s->fs_files[i++] = dent; + } + ptr += e->len; + bytes_left -= e->len; + } + + dev_info(&dev->dev, "Created %d entries in securityfs secrets/coco\n", i); + return 0; + +err_cleanup: + efi_secret_securityfs_teardown(dev); + return ret; +} + +static void efi_secret_unmap_area(void) +{ + struct efi_secret *s = efi_secret_get(); + + if (s->secret_data) { + iounmap(s->secret_data); + s->secret_data = NULL; + s->secret_data_len = 0; + } +} + +static int efi_secret_probe(struct platform_device *dev) +{ + int ret; + + ret = efi_secret_map_area(dev); + if (ret) + return ret; + + ret = efi_secret_securityfs_setup(dev); + if (ret) + goto err_unmap; + + return ret; + +err_unmap: + efi_secret_unmap_area(); + return ret; +} + +static int efi_secret_remove(struct platform_device *dev) +{ + efi_secret_securityfs_teardown(dev); + efi_secret_unmap_area(); + return 0; +} + +static struct platform_driver efi_secret_driver = { + .probe = efi_secret_probe, + .remove = efi_secret_remove, + .driver = { + .name = "efi_secret", + }, +}; + +module_platform_driver(efi_secret_driver); + +MODULE_DESCRIPTION("Confidential computing EFI secret area access"); +MODULE_AUTHOR("IBM"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:efi_secret"); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe402483ad5..30b066299d39 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); - int seq = 0; + struct key *key; + int ret, seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + if (!(query_flags & AT_STATX_DONT_SYNC) && + !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(&init_user_ns, inode, stat); diff --git a/fs/internal.h b/fs/internal.h index 08503dc68d2b..9a6c233ee7f1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -191,3 +191,32 @@ long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); + +/* + * fs/xattr.c: + */ +struct xattr_name { + char name[XATTR_NAME_MAX + 1]; +}; + +struct xattr_ctx { + /* Value of attribute */ + union { + const void __user *cvalue; + void __user *value; + }; + void *kvalue; + size_t size; + /* Attribute name */ + struct xattr_name *kname; + unsigned int flags; +}; + + +ssize_t do_getxattr(struct user_namespace *mnt_userns, + struct dentry *d, + struct xattr_ctx *ctx); + +int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx); diff --git a/fs/io-wq.c b/fs/io-wq.c index 32aeb2c581c5..824623bcf1a5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { - set_notify_signal(worker->task); + __set_notify_signal(worker->task); wake_up_process(worker->task); return false; } @@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, { if (work && match->fn(work, match->data)) { work->flags |= IO_WQ_WORK_CANCEL; - set_notify_signal(worker->task); + __set_notify_signal(worker->task); return true; } diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..ba6eee76d028 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int cancel_seq; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91de361ea9ab..9f1c682d7caf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -80,6 +80,7 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/xattr.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -94,7 +95,7 @@ #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 15) +#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -113,6 +114,11 @@ #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ + IO_REQ_CLEAN_FLAGS) + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -166,7 +172,7 @@ struct io_rings { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 sq_flags; + atomic_t sq_flags; /* * Runtime CQ flags * @@ -198,13 +204,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -216,10 +215,27 @@ struct io_mapped_ubuf { struct io_ring_ctx; struct io_overflow_cqe { - struct io_uring_cqe cqe; struct list_head list; + struct io_uring_cqe cqe; }; +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + struct io_fixed_file { /* file * with additional FFS_* flags */ unsigned long file_ptr; @@ -237,6 +253,8 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; }; struct io_rsrc_node { @@ -261,10 +279,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head list; - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. + */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -337,7 +371,7 @@ struct io_ev_fd { struct rcu_head rcu; }; -#define IO_BUFFERS_HASH_BITS 5 +#define BGID_ARRAY 64 struct io_ring_ctx { /* const or read-mostly hot data */ @@ -346,6 +380,7 @@ struct io_ring_ctx { struct io_rings *rings; unsigned int flags; + enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -353,6 +388,7 @@ struct io_ring_ctx { unsigned int drain_active: 1; unsigned int drain_disabled: 1; unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -382,17 +418,21 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; int rsrc_cached_refs; + atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct list_head *io_buffers; - struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; u32 pers_next; @@ -409,9 +449,16 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq_overflow; + unsigned long check_cq; struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; @@ -497,7 +544,7 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - struct io_wq_work_list prior_task_list; + struct io_wq_work_list prio_task_list; struct callback_head task_work; struct file **registered_rings; bool task_running; @@ -546,6 +593,16 @@ struct io_accept { unsigned long nofile; }; +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + struct io_sync { struct file *file; loff_t len; @@ -557,6 +614,8 @@ struct io_sync { struct io_cancel { struct file *file; u64 addr; + u32 flags; + s32 fd; }; struct io_timeout { @@ -585,7 +644,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; - u32 flags; + rwf_t flags; }; struct io_connect { @@ -602,9 +661,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; - int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -722,6 +781,12 @@ struct io_msg { u32 len; }; +struct io_nop { + struct file *file; + u64 extra1; + u64 extra2; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -748,6 +813,12 @@ struct io_async_rw { struct wait_page_queue wpq; }; +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -766,6 +837,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -776,6 +848,7 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -816,6 +889,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -840,6 +915,8 @@ enum { REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), }; struct async_poll { @@ -862,6 +939,21 @@ enum { IORING_RSRC_BUFFER = 1, }; +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -897,46 +989,65 @@ struct io_kiocb { struct io_symlink symlink; struct io_hardlink hardlink; struct io_msg msg; + struct io_xattr xattr; + struct io_socket sock; + struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ u16 buf_index; unsigned int flags; - u64 user_data; - u32 result; - /* fd initially, then cflags for completion */ - union { - u32 cflags; - int fd; - }; + struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; - struct percpu_ref *fixed_rsrc_refs; - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; union { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ - int apoll_events; + __poll_t apoll_events; }; atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ @@ -956,6 +1067,24 @@ struct io_defer_entry { u32 seq; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -977,12 +1106,20 @@ struct io_op_def { unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; }; static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = {}, + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .buffer_select = 1, + }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -991,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -1001,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { @@ -1013,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -1022,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { @@ -1064,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1086,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_CLOSE] = {}, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, @@ -1097,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1106,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { @@ -1140,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -1160,11 +1313,30 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_LINKAT] = {}, [IORING_OP_MSG_RING] = { .needs_file = 1, + .iopoll = 1, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_SETXATTR] = {}, + [IORING_OP_FGETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_GETXATTR] = {}, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), }, }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); @@ -1173,10 +1345,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); - -static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req); +static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1185,10 +1354,10 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); -static void __io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); @@ -1201,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; +const char *io_uring_get_opcode(u8 opcode) +{ + switch ((enum io_uring_op)opcode) { + case IORING_OP_NOP: + return "NOP"; + case IORING_OP_READV: + return "READV"; + case IORING_OP_WRITEV: + return "WRITEV"; + case IORING_OP_FSYNC: + return "FSYNC"; + case IORING_OP_READ_FIXED: + return "READ_FIXED"; + case IORING_OP_WRITE_FIXED: + return "WRITE_FIXED"; + case IORING_OP_POLL_ADD: + return "POLL_ADD"; + case IORING_OP_POLL_REMOVE: + return "POLL_REMOVE"; + case IORING_OP_SYNC_FILE_RANGE: + return "SYNC_FILE_RANGE"; + case IORING_OP_SENDMSG: + return "SENDMSG"; + case IORING_OP_RECVMSG: + return "RECVMSG"; + case IORING_OP_TIMEOUT: + return "TIMEOUT"; + case IORING_OP_TIMEOUT_REMOVE: + return "TIMEOUT_REMOVE"; + case IORING_OP_ACCEPT: + return "ACCEPT"; + case IORING_OP_ASYNC_CANCEL: + return "ASYNC_CANCEL"; + case IORING_OP_LINK_TIMEOUT: + return "LINK_TIMEOUT"; + case IORING_OP_CONNECT: + return "CONNECT"; + case IORING_OP_FALLOCATE: + return "FALLOCATE"; + case IORING_OP_OPENAT: + return "OPENAT"; + case IORING_OP_CLOSE: + return "CLOSE"; + case IORING_OP_FILES_UPDATE: + return "FILES_UPDATE"; + case IORING_OP_STATX: + return "STATX"; + case IORING_OP_READ: + return "READ"; + case IORING_OP_WRITE: + return "WRITE"; + case IORING_OP_FADVISE: + return "FADVISE"; + case IORING_OP_MADVISE: + return "MADVISE"; + case IORING_OP_SEND: + return "SEND"; + case IORING_OP_RECV: + return "RECV"; + case IORING_OP_OPENAT2: + return "OPENAT2"; + case IORING_OP_EPOLL_CTL: + return "EPOLL_CTL"; + case IORING_OP_SPLICE: + return "SPLICE"; + case IORING_OP_PROVIDE_BUFFERS: + return "PROVIDE_BUFFERS"; + case IORING_OP_REMOVE_BUFFERS: + return "REMOVE_BUFFERS"; + case IORING_OP_TEE: + return "TEE"; + case IORING_OP_SHUTDOWN: + return "SHUTDOWN"; + case IORING_OP_RENAMEAT: + return "RENAMEAT"; + case IORING_OP_UNLINKAT: + return "UNLINKAT"; + case IORING_OP_MKDIRAT: + return "MKDIRAT"; + case IORING_OP_SYMLINKAT: + return "SYMLINKAT"; + case IORING_OP_LINKAT: + return "LINKAT"; + case IORING_OP_MSG_RING: + return "MSG_RING"; + case IORING_OP_FSETXATTR: + return "FSETXATTR"; + case IORING_OP_SETXATTR: + return "SETXATTR"; + case IORING_OP_FGETXATTR: + return "FGETXATTR"; + case IORING_OP_GETXATTR: + return "GETXATTR"; + case IORING_OP_SOCKET: + return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; + case IORING_OP_LAST: + return "INVALID"; + } + return "INVALID"; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1219,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} +#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -1280,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req) #define IO_RSRC_REF_BATCH 100 +static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct percpu_ref *ref = req->fixed_rsrc_refs; + struct io_rsrc_node *node = req->rsrc_node; - if (ref) { - if (ref == &ctx->rsrc_node->refs) + if (node) { + if (node == ctx->rsrc_node) ctx->rsrc_cached_refs++; else - percpu_ref_put(ref); + io_rsrc_put_node(node, 1); } } -static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +static inline void io_req_put_rsrc(struct io_kiocb *req) { - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); } static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { if (ctx->rsrc_cached_refs) { - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); ctx->rsrc_cached_refs = 0; } } @@ -1320,8 +1634,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { - if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); @@ -1329,28 +1643,30 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (unlikely(ctx->rsrc_cached_refs < 0)) io_rsrc_refs_refill(ctx); } else { - percpu_ref_get(req->fixed_rsrc_refs); + percpu_ref_get(&req->rsrc_node->refs); } } } static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - struct io_buffer *kbuf = req->kbuf; - unsigned int cflags; + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&kbuf->list, list); - req->kbuf = NULL; - return cflags; + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1360,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1375,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. */ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1393,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - struct list_head *hash_list; - struct io_buffer_list *bl; - - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - list_for_each_entry(bl, hash_list, list) - if (bl->bgid == bgid || bgid == -1U) - return bl; + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; - return NULL; + return xa_load(&ctx->io_bl_xa, bgid); } static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1410,25 +1724,33 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->kbuf = NULL; + req->buf_index = buf->bgid; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1469,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->result = res; + req->cqe.res = res; +} + +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1506,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i, hash_bits; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + xa_init(&ctx->io_bl_xa); + /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. @@ -1533,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, - sizeof(struct list_head), GFP_KERNEL); - if (!ctx->io_buffers) - goto err; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) - INIT_LIST_HEAD(&ctx->io_buffers[i]); - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1575,7 +1897,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } @@ -1599,10 +1922,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - static inline bool io_req_ffs_set(struct io_kiocb *req) { return req->flags & REQ_F_FIXED_FILE; @@ -1629,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return __io_prep_linked_timeout(req); } +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ + io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) + __io_arm_ltimeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1641,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1672,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; + struct io_submit_state *state = &req->ctx->submit_state; if (!(req->flags & REQ_F_CQE_SKIP)) - ctx->submit_state.flush_cqes = true; + state->flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1702,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, - &req->work, io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, + req->opcode, req->flags, &req->work, + io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1721,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_fill_cqe_req(req, status, 0); - io_put_req_deferred(req); + io_req_tw_post_queue(req, status, 0); } } @@ -1804,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail, mask = ctx->cq_entries - 1; - - /* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ - if (__io_cqring_events(ctx) == ctx->cq_entries) + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) return NULL; - tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & mask]; + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); } static void io_eventfd_signal(struct io_ring_ctx *ctx) @@ -1889,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed, posted; + size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; + if (ctx->flags & IORING_SETUP_CQE32) + cqe_size <<= 1; + posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { @@ -1904,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (cqe) - memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + memcpy(cqe, &ocqe->cqe, cqe_size); else io_account_cq_overflow(ctx); @@ -1915,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -1932,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; - if (test_bit(0, &ctx->check_cq_overflow)) { + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -1944,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - if (likely(task == current)) { - tctx->cached_refs += nr; - } else { - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); - } + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); } static void io_task_refs_refill(struct io_uring_task *tctx) @@ -1990,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) + s32 res, u32 cflags, u64 extra1, + u64 extra2) { struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, @@ -2002,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ io_account_cq_overflow(ctx); + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } @@ -2034,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, WRITE_ONCE(cqe->flags, cflags); return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags); + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +} + +static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); +} + +static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + u64 extra1 = req->extra1; + u64 extra2 = req->extra2; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + cqe->big_cqe[0] = extra1; + cqe->big_cqe[1] = extra2; + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, extra1, extra2); } static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } -static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, + u64 extra1, u64 extra2) { - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return; + if (req->flags & REQ_F_CQE_SKIP) + return; + + trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, + extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->cqe.user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return; + } + + io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags); + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); return __io_fill_cqe(ctx, user_data, res, cflags); } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_put(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { @@ -2077,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, req->link = NULL; } } - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -2091,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } -static void io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(req, res, cflags); + __io_req_complete_put(req); +} + +static void __io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe32_req(req, res, cflags, extra1, extra2); + __io_req_complete_put(req); +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -2103,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, io_cqring_ev_posted(ctx); } +static void io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post32(req, res, cflags, extra1, extra2); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { - req->result = res; - req->cflags = cflags; + req->cqe.res = res; + req->cqe.flags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } @@ -2120,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } +static inline void __io_req_complete32(struct io_kiocb *req, + unsigned int issue_flags, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_complete_state(req, res, cflags); + req->extra1 = extra1; + req->extra2 = extra2; + } else { + io_req_complete_post32(req, res, cflags, extra1, extra2); + } +} + static inline void io_req_complete(struct io_kiocb *req, s32 res) { + if (res < 0) + req_set_fail(req); __io_req_complete(req, 0, res, 0); } @@ -2131,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } -static void io_req_complete_fail_submit(struct io_kiocb *req) -{ - /* - * We don't submit, fail them all, for that replace hardlinks with - * normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->result); -} - /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. @@ -2152,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->result = 0; + req->cqe.res = 0; } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -2164,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -/* Returns true IFF there are requests in the cache */ -static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) { - struct io_submit_state *state = &ctx->submit_state; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, state); - return !!state->free_list.next; + return !ctx->submit_state.free_list.next; } /* @@ -2188,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - struct io_kiocb *req; int ret, i; - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) - return true; + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); + if (!io_req_cache_empty(ctx)) + return true; + } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -2212,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { - req = reqs[i]; + struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); - wq_stack_add_head(&req->comp_list, &state->free_list); + io_req_add_to_cache(req, ctx); } return true; } static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) { - if (unlikely(!ctx->submit_state.free_list.next)) + if (unlikely(io_req_cache_empty(ctx))) return __io_alloc_req_refill(ctx); return true; } @@ -2251,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void __io_free_req(struct io_kiocb *req) +static __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); io_dismantle_req(req); io_put_task(req->task, 1); @@ -2273,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static bool io_kill_linked_timeout(struct io_kiocb *req) +static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -2286,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); - return true; + return link; } } - return false; + return NULL; } static void io_fail_links(struct io_kiocb *req) @@ -2306,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req) long res = -ECANCELED; if (link->flags & REQ_F_FAIL) - res = link->result; + res = link->cqe.res; nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->user_data, + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, req->opcode, link); - if (!ignore_cqes) { + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else link->flags &= ~REQ_F_CQE_SKIP; - io_fill_cqe_req(link, res, 0); - } - io_put_req_deferred(link); + __io_req_complete_post(link, res, 0); link = nxt; } } @@ -2326,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req) static bool io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + struct io_kiocb *link = NULL; bool posted = false; if (req->flags & REQ_F_ARM_LTIMEOUT) { - struct io_kiocb *link = req->link; - + link = req->link; req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); + io_req_tw_post_queue(link, -ECANCELED, 0); posted = true; } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); - posted = io_kill_linked_timeout(req); + link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { @@ -2361,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2372,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other @@ -2391,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2434,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, + __io_req_complete_post(req, req->cqe.res, io_put_kbuf_comp(req)); node = next; } while (node); @@ -2475,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && - !tctx->prior_task_list.first && uring_locked) - io_submit_flush_completions(ctx); - spin_lock_irq(&tctx->task_lock); - node1 = tctx->prior_task_list.first; + node1 = tctx->prio_task_list.first; node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); @@ -2492,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb) if (node1) handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); + + if (data_race(!tctx->task_list.first) && + data_race(!tctx->prio_task_list.first) && uring_locked) + io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -2505,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req, bool priority) +static void __io_req_task_work_add(struct io_kiocb *req, + struct io_uring_task *tctx, + struct io_wq_work_list *list) { - struct task_struct *tsk = req->task; - struct io_uring_task *tctx = tsk->io_uring; - enum task_work_notify_mode notify; + struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; bool running; - WARN_ON_ONCE(!tctx); - io_drop_inflight_file(req); spin_lock_irqsave(&tctx->task_lock, flags); - if (priority) - wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); - else - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2532,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { - if (notify == TWA_NONE) - wake_up_process(tsk); + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - } spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); + node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2559,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) } } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_work_add(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; + + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_task_prio_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + if (req->ctx->flags & IORING_SETUP_SQPOLL) + __io_req_task_work_add(req, tctx, &tctx->prio_task_list); + else + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ + io_req_complete_post(req, req->cqe.res, req->cqe.flags); +} + +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; + req->io_task_work.func = io_req_tw_post; + io_req_task_work_add(req); +} + +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +{ /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(ctx, locked); - io_req_complete_failed(req, req->result); + io_tw_lock(req->ctx, locked); + io_req_complete_failed(req, req->cqe.res); } static void io_req_task_submit(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - - io_tw_lock(ctx, locked); + io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) - __io_queue_sqe(req); + io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->result = ret; + req->cqe.res = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue_reissue(struct io_kiocb *req) { - req->io_task_work.func = io_queue_async_work; - io_req_task_work_add(req, false); + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); } -static inline void io_queue_next(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2607,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_req(struct io_kiocb *req) -{ - io_queue_next(req); - __io_free_req(req); -} - -static void io_free_req_work(struct io_kiocb *req, bool *locked) -{ - io_free_req(req); -} - static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -2629,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (unlikely(req->flags & REQ_F_REFCOUNT)) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REFCOUNT) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } + if (req->flags & IO_REQ_LINK_FLAGS) + io_queue_next(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); } + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); io_req_put_rsrc_locked(req, ctx); - io_queue_next(req); - io_dismantle_req(req); if (req->task != task) { if (task) @@ -2647,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, } task_refs++; node = req->comp_list.next; - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); } while (node); if (task) @@ -2666,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, req->result, req->cflags); - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!(ctx->flags & IORING_SETUP_CQE32)) + __io_fill_cqe_req_filled(ctx, req); + else + __io_fill_cqe32_req_filled(ctx, req); } } @@ -2698,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { - nxt = io_req_find_next(req); - __io_free_req(req); + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) + nxt = io_req_find_next(req); + io_free_req(req); } return nxt; } static inline void io_put_req(struct io_kiocb *req) { - if (req_ref_put_and_test(req)) - io_free_req(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req) -{ if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req, false); + io_queue_next(req); + io_free_req(req); } } @@ -2800,7 +3282,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); } if (unlikely(!nr_events)) @@ -2846,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; int ret = 0; + unsigned long check_cq; /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->check_cq_overflow)) + check_cq = READ_ONCE(ctx->check_cq); + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) - goto out; + return 0; + + /* + * Similarly do not spin if we have not informed the user of any + * dropped CQE. + */ + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; + do { /* * If a submit got punted to a workqueue, we can have the @@ -2891,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) nr_events += ret; ret = 0; } while (nr_events < min && !need_resched()); -out: - mutex_unlock(&ctx->uring_lock); + return ret; } @@ -2965,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) } else { fsnotify_access(req->file); } - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return true; } req_set_fail(req); - req->result = res; + req->cqe.res = res; } return false; } static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->result; + int res = req->cqe.res; if (*locked) { io_req_complete_state(req, res, io_put_kbuf(req, 0)); @@ -2995,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, + __io_req_complete(req, issue_flags, req->cqe.res, io_put_kbuf(req, issue_flags)); } @@ -3005,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->result = res; + req->cqe.res = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); + io_req_task_prio_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -3016,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return; } - req->result = res; + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ @@ -3131,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; + if (io_file_need_scm(file)) + res |= FFS_SCM; return res; } @@ -3162,6 +3649,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); + /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3310,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) -{ - if (needs_lock) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) { - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (needs_lock) - mutex_lock(&ctx->uring_lock); -} - -static void io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - struct list_head *list; - - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - INIT_LIST_HEAD(&bl->buf_list); bl->bgid = bgid; - list_add(&bl->list, list); + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) { - struct io_buffer *kbuf = req->kbuf; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - - if (req->flags & REQ_F_BUFFER_SELECTED) - return kbuf; + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); - - bl = io_buffer_get_list(ctx, bgid); - if (bl && !list_empty(&bl->buf_list)) { kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return NULL; + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; } else { - kbuf = ERR_PTR(-ENOBUFS); + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; - io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; + if (issue_flags & IO_URING_F_UNLOCKED) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { - struct io_buffer *kbuf; - u16 bgid; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); - bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, issue_flags); - if (IS_ERR(kbuf)) - return kbuf; - return u64_to_user_ptr(kbuf->addr); + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; } #ifdef CONFIG_COMPAT @@ -3390,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; - ssize_t len; + size_t len; uiov = u64_to_user_ptr(req->rw.addr); if (!access_ok(uiov, sizeof(*uiov))) @@ -3401,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = (compat_size_t) len; + req->rw.len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3423,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = len; + req->rw.len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf = req->kbuf; - - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov[0].iov_len = kbuf->len; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); + iov[0].iov_len = req->rw.len; return 0; } if (req->rw.len != 1) @@ -3452,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -3470,18 +3984,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, return NULL; } - /* buffer index only valid with fixed read/write, or buffer select */ - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) - return ERR_PTR(-EINVAL); - buf = u64_to_user_ptr(req->rw.addr); sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); - if (IS_ERR(buf)) - return ERR_CAST(buf); + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -3836,7 +4347,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3852,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(READ, req->file, ppos, req->result); + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3874,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret == req->result || ret <= 0 || !force_nonblock || + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3964,7 +4475,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3984,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(WRITE, req->file, ppos, req->result); + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) goto out_free; @@ -4048,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4087,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } +static inline void __io_xattr_finish(struct io_kiocb *req) +{ + struct io_xattr *ix = &req->xattr; + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + __io_xattr_finish(req); + io_req_complete(req, ret); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_fgetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +static int io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return 0; +} + +static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fsetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + + return 0; +} + +static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = &req->unlink; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4136,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4148,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, struct io_mkdir *mkd = &req->mkdir; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4179,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4191,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, struct io_symlink *sl = &req->symlink; const char __user *oldpath, *newpath; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4228,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4240,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req, struct io_hardlink *lnk = &req->hardlink; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4279,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_complete(req, ret); + return 0; +} + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_prio_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + if (ret < 0) req_set_fail(req); - io_req_complete(req, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); return 0; } @@ -4289,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; @@ -4316,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, req->shutdown.how); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4331,9 +5145,6 @@ static int __io_splice_prep(struct io_kiocb *req, struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); if (unlikely(sp->flags & ~valid_flags)) @@ -4378,7 +5189,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); return 0; } @@ -4423,7 +5234,20 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); + return 0; +} + +static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If the ring is setup with CQE32, relay back addr/addr + */ + if (req->ctx->flags & IORING_SETUP_CQE32) { + req->nop.extra1 = READ_ONCE(sqe->addr); + req->nop.extra2 = READ_ONCE(sqe->addr2); + } + return 0; } @@ -4432,20 +5256,31 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + unsigned int cflags; + void __user *buf; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + if (req->flags & REQ_F_BUFFER_SELECT) { + size_t len = 1; - __io_req_complete(req, issue_flags, 0, 0); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + } + + cflags = io_put_kbuf(req, issue_flags); + if (!(req->ctx->flags & IORING_SETUP_CQE32)) + __io_req_complete(req, issue_flags, 0, cflags); + else + __io_req_complete32(req, issue_flags, 0, cflags, + req->nop.extra1, req->nop.extra2); return 0; } static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) return -EINVAL; req->msg.user_data = READ_ONCE(sqe->off); @@ -4481,17 +5316,15 @@ done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; return 0; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4515,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4524,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4545,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - if (ret < 0) - req_set_fail(req); - else + if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); return 0; @@ -4558,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4615,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } +static int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + if (table->alloc_hint >= nr) + table->alloc_hint = 0; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) { + table->alloc_hint = ret + 1; + return ret; + } + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) { + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -4670,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4692,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || sqe->splice_fd_in) return -EINVAL; @@ -4715,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -4736,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - - io_ring_submit_lock(ctx, needs_lock); - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4762,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4859,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); - lockdep_assert_held(&ctx->uring_lock); + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; } - io_buffer_add_list(ctx, bl, p->bgid); + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; } ret = io_add_buffers(ctx, p, bl); @@ -4887,7 +5811,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4895,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4941,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); @@ -4965,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4976,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); @@ -5014,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *path; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5052,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, ctx->buffer); - - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5096,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) spin_unlock(&files->file_lock); goto err; } - file = fdt->fd[close->fd]; + file = rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; @@ -5130,12 +6039,7 @@ err: static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -5154,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } #if defined(CONFIG_NET) +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -5206,13 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) return -EINVAL; if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5221,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5245,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5258,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5278,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5291,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5304,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5397,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, return __io_recvmsg_copy_hdr(req, iomsg); } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5419,14 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) return -EINVAL; if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->bgid = READ_ONCE(sqe->buf_group); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5439,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; - struct io_buffer *kbuf; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5469,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - kmsg->fast_iov[0].iov_len = req->sr_msg.len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, - 1, req->sr_msg.len); + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); } - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return io_setup_async_msg(req, kmsg); @@ -5510,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; - void __user *buf = sr->buf; struct socket *sock; struct iovec iov; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - buf = u64_to_user_ptr(kbuf->addr); + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; } - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; msg.msg_name = NULL; + msg.msg_namelen = 0; msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; msg.msg_controllen = 0; - msg.msg_namelen = 0; msg.msg_iocb = NULL; - msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5577,36 +6521,49 @@ out_free: ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; + unsigned flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) - return -EINVAL; + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { + struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -5614,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; +retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5625,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = 0; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = &req->sock; + + if (sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = &req->sock; + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; @@ -5635,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) ret = fd; } else { ret = io_install_fixed_file(req, file, issue_flags, - accept->file_slot - 1); + sock->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -5653,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5729,6 +6766,7 @@ IO_NETOP_PREP_ASYNC(sendmsg); IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -5779,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct hlist_head *list; - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; hlist_add_head(&req->hash_node, list); } @@ -5838,22 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. >0 when no action require, which is * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->result. + * the request, then the mask is stored in req->cqe.res. */ -static int io_poll_check_events(struct io_kiocb *req, bool locked) +static int io_poll_check_events(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - int v; + int v, ret; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) - io_poll_mark_cancelled(req); + return -ECANCELED; do { v = atomic_read(&req->poll_refs); @@ -5864,32 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; - if (!req->result) { + if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; if (unlikely(!io_assign_file(req, flags))) return -EBADF; - req->result = vfs_poll(req->file, &pt) & req->apoll_events; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } - /* multishot, just fill an CQE and proceed */ - if (req->result && !(req->apoll_events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & req->apoll_events); + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->user_data, mask, - IORING_CQE_F_MORE); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (unlikely(!filled)) - return -ECANCELED; - io_cqring_ev_posted(ctx); - } else if (req->result) { - return 0; + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; } + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + ret = io_issue_sqe(req, + IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (ret) + return ret; + /* * Release all references, retry if someone tried to restart * task_work while we were executing it. @@ -5904,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; if (!ret) { - req->result = mangle_poll(req->result & req->poll.events); + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); } else { - req->result = ret; + req->cqe.res = ret; req_set_fail(req); } io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->result, 0); + __io_req_complete_post(req, req->cqe.res, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5929,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -5944,9 +6997,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) { - req->result = mask; + req->cqe.res = mask; /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -5959,11 +7012,12 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); - io_req_task_work_add(req, false); + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res, events); @@ -5978,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -6012,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, } /* for instances that support it check for an event match first */ - if (mask && !(mask & poll->events)) + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { @@ -6098,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, int v; INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; @@ -6168,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = POLLPRI | POLLERR; int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + if (!file_can_poll(req->file)) return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; if (def->pollin) { - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if ((req->opcode == IORING_OP_RECVMSG) && (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; + mask &= ~EPOLLIN; } else { - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, poll.wait.entry); list_del_init(&apoll->poll.wait.entry); @@ -6209,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -6242,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, return found; } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr != req->user_data) + if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } return req; } return NULL; } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + static bool io_poll_disarm(struct io_kiocb *req) __must_hold(&ctx->completion_lock) { @@ -6270,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); + struct io_kiocb *req; + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; io_poll_cancel_req(req); @@ -6302,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req, struct io_poll_update *upd = &req->poll_update; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6334,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_poll_iocb *poll = &req->poll; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) @@ -6366,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; bool locked; spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); + preq = io_poll_find(ctx, true, &cd); if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); ret = preq ? -EALREADY : -ENOENT; @@ -6388,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) preq->poll.events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; + preq->cqe.user_data = req->poll_update.new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ @@ -6397,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) } req_set_fail(preq); - preq->result = -ECANCELED; + preq->cqe.res = -ECANCELED; locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: @@ -6425,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->result = -ETIME; + req->cqe.res = -ETIME; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - __u64 user_data) + struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -6440,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = user_data == req->user_data; - if (found) - break; + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != req->cqe.user_data) + continue; + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + found = true; + break; } if (!found) return ERR_PTR(-ENOENT); @@ -6454,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, return req; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) - __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -6491,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, bool found = false; list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->user_data; + found = user_data == req->cqe.user_data; if (found) break; } @@ -6511,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout_data *data; if (IS_ERR(req)) @@ -6531,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { struct io_timeout_rem *tr = &req->timeout_rem; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; @@ -6576,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, tr->addr); - spin_unlock_irq(&ctx->timeout_lock); + ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6605,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned flags; u32 off = READ_ONCE(sqe->off); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || - sqe->splice_fd_in) + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -6707,30 +7804,42 @@ add: return 0; } -struct io_cancel_data { - struct io_ring_ctx *ctx; - u64 user_data; -}; - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->user_data == cd->user_data; + if (req->ctx != cd->ctx) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, - struct io_ring_ctx *ctx) +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) { - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; + bool all; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6746,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) +static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + ret = io_async_cancel_one(req->task->io_uring, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. @@ -6762,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) return 0; spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; } +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); + req->cancel.flags = READ_ONCE(sqe->cancel_flags); + if (req->cancel.flags & ~CANCEL_FLAGS) + return -EINVAL; + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + req->cancel.fd = READ_ONCE(sqe->fd); + } + return 0; } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - u64 sqe_addr = req->cancel.addr; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; - int ret; + int ret, nr = 0; - ret = io_try_cancel_userdata(req, sqe_addr); - if (ret != -ENOENT) - goto done; + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); - if (ret != -ENOENT) - break; + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } } - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); + return all ? nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = req->cancel.addr, + .flags = req->cancel.flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->cancel.fd, + issue_flags); + else + req->file = io_file_get_normal(req, req->cancel.fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + + ret = __io_async_cancel(&cd, req, issue_flags); done: if (ret < 0) req_set_fail(req); @@ -6824,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6838,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6849,10 +7999,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); @@ -6864,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { switch (req->opcode) { case IORING_OP_NOP: - return 0; + return io_nop_prep(req, sqe); case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: @@ -6938,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_linkat_prep(req, sqe); case IORING_OP_MSG_RING: return io_msg_ring_prep(req, sqe); + case IORING_OP_FSETXATTR: + return io_fsetxattr_prep(req, sqe); + case IORING_OP_SETXATTR: + return io_setxattr_prep(req, sqe); + case IORING_OP_FGETXATTR: + return io_fgetxattr_prep(req, sqe); + case IORING_OP_GETXATTR: + return io_getxattr_prep(req, sqe); + case IORING_OP_SOCKET: + return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6951,7 +8113,7 @@ static int io_req_prep_async(struct io_kiocb *req) /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->fd); + req->file = io_file_get_normal(req, req->cqe.fd); if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) @@ -6970,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -6979,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; + struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(req, req) + io_for_each_link(cur, req) seq--; return seq; } @@ -7024,7 +8189,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -7086,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req) if (req->statx.filename) putname(req->statx.filename); break; + case IORING_OP_SETXATTR: + case IORING_OP_FSETXATTR: + case IORING_OP_GETXATTR: + case IORING_OP_FGETXATTR: + __io_xattr_finish(req); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -7108,15 +8279,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return true; if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->fd, issue_flags); + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else - req->file = io_file_get_normal(req, req->fd); - if (req->file) - return true; + req->file = io_file_get_normal(req, req->cqe.fd); - req_set_fail(req); - req->result = -EBADF; - return false; + return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -7246,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MSG_RING: ret = io_msg_ring(req, issue_flags); break; + case IORING_OP_FSETXATTR: + ret = io_fsetxattr(req, issue_flags); + break; + case IORING_OP_SETXATTR: + ret = io_setxattr(req, issue_flags); + break; + case IORING_OP_FGETXATTR: + ret = io_fgetxattr(req, issue_flags); + break; + case IORING_OP_GETXATTR: + ret = io_getxattr(req, issue_flags); + break; + case IORING_OP_SOCKET: + ret = io_socket(req, issue_flags); + break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -7279,7 +8464,6 @@ static void io_wq_submit_work(struct io_wq_work *work) const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; - struct io_kiocb *timeout; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ @@ -7288,10 +8472,7 @@ static void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - timeout = io_prep_linked_timeout(req); - if (timeout) - io_queue_linked_timeout(timeout); - + io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { @@ -7324,6 +8505,8 @@ fail: * wait for request slots on the block side. */ if (!needs_poll) { + if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + break; cond_resched(); continue; } @@ -7369,8 +8552,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, struct file *file = NULL; unsigned long file_ptr; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; @@ -7381,9 +8563,9 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); + WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -7404,7 +8586,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); + trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) @@ -7418,8 +8600,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) int ret = -ENOENT; if (prev) { - if (!(req->task->flags & PF_EXITING)) - ret = io_try_cancel_userdata(req, prev->user_data); + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { @@ -7453,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -7479,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void io_queue_sqe_arm_apoll(struct io_kiocb *req) +static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; + + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { + io_req_complete_failed(req, ret); + return; + } + + linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: @@ -7493,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; @@ -7503,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static inline void __io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -7519,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) { - linked_timeout = io_prep_linked_timeout(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - io_queue_sqe_arm_apoll(req); - } else { - io_req_complete_failed(req, ret); - } + if (likely(!ret)) + io_arm_ltimeout(req); + else + io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); + if (unlikely(req->flags & REQ_F_FAIL)) { + /* + * We don't submit, fail them all, for that replace hardlinks + * with normal links. Extra REQ_F_LINK is tolerated. + */ + req->flags &= ~REQ_F_HARDLINK; + req->flags |= REQ_F_LINK; + io_req_complete_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { @@ -7543,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); } } -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) - __io_queue_sqe(req); - else - io_queue_sqe_fallback(req); -} - /* * Check SQE restrictions (opcode and flags). * @@ -7610,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); + req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->fixed_rsrc_refs = NULL; + req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { @@ -7623,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[opcode].buffer_select) - return -EOPNOTSUPP; + if (sqe_flags & IOSQE_BUFFER_SELECT) { + if (!io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + req->buf_index = READ_ONCE(sqe->buf_group); + } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { @@ -7648,10 +8836,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } + if (!io_op_defs[opcode].ioprio && sqe->ioprio) + return -EINVAL; + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; - req->fd = READ_ONCE(sqe->fd); + req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the @@ -7683,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_req_prep(req, sqe); } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, + struct io_kiocb *req, int ret) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; + struct io_kiocb *head = link->head; + + trace_io_uring_req_failed(sqe, ctx, req, ret); + + /* + * Avoid breaking links in the middle as it renders links with SQPOLL + * unusable. Instead of failing eagerly, continue assembling the link if + * applicable and mark the head with REQ_F_FAIL. The link flushing code + * should find the flag and handle the rest. + */ + req_fail_link_node(req, ret); + if (head && !(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + + if (!(req->flags & IO_REQ_LINK_FLAGS)) { + if (head) { + link->last->link = req; + link->head = NULL; + req = head; + } + io_queue_sqe_fallback(req); + return ret; + } + + if (head) + link->last->link = req; + else + link->head = req; + link->last = req; + return 0; +} + +static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { @@ -7691,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret; ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* fail even hard links since we don't submit */ - if (link->head) { - /* - * we can judge a link req is failed or cancelled by if - * REQ_F_FAIL is set, but the head is an exception since - * it may be set REQ_F_FAIL because of other req's failure - * so let's leverage req->result to distinguish if a head - * is set REQ_F_FAIL because of its failure or other req's - * failure so that we can set the correct ret code for it. - * init result here to avoid affecting the normal path. - */ - if (!(link->head->flags & REQ_F_FAIL)) - req_fail_link_node(link->head, -ECANCELED); - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - /* - * the current req is a normal req, we should return - * error and thus break the submittion loop. - */ - io_req_complete_failed(req, ret); - return ret; - } - req_fail_link_node(req, ret); - } + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); @@ -7730,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (link->head) { - struct io_kiocb *head = link->head; - - if (!(req->flags & REQ_F_FAIL)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) { - req_fail_link_node(req, ret); - if (!(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - } - } - trace_io_uring_link(ctx, req, head); + if (unlikely(link->head)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); + + trace_io_uring_link(ctx, req, link->head); link->last->link = req; link->last = req; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) return 0; - /* last request of a link, enqueue the link */ + /* last request of the link, flush it */ + req = link->head; link->head = NULL; - req = head; - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) + goto fallback; + + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { + if (req->flags & IO_REQ_LINK_FLAGS) { + link->head = req; + link->last = req; + } else { +fallback: + io_queue_sqe_fallback(req); + } return 0; } @@ -7767,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - if (state->link.head) - io_queue_sqe(state->link.head); + if (unlikely(state->link.head)) + io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) @@ -7822,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * though the application is the one updating it. */ head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) + if (likely(head < ctx->sq_entries)) { + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; return &ctx->sq_sqes[head]; + } /* drop invalid entries */ ctx->cq_extra--; @@ -7836,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); - int submitted = 0; + unsigned int left; + int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(nr); + ret = left = min3(nr, ctx->sq_entries, entries); + io_get_task_refs(left); + io_submit_state_start(&ctx->submit_state, left); - io_submit_state_start(&ctx->submit_state, nr); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) { - if (!submitted) - submitted = -EAGAIN; + if (unlikely(!io_alloc_req_refill(ctx))) break; - } req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); break; } - /* will complete beyond this point, count as submitted */ - submitted++; - if (io_submit_sqe(ctx, req, sqe)) { - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) - break; - } - } while (submitted < nr); - if (unlikely(submitted != nr)) { - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; - int unused = nr - ref_used; + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (unlikely(io_submit_sqe(ctx, req, sqe)) && + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { + left--; + break; + } + } while (--left); - current->io_uring->cached_refs += unused; + if (unlikely(left)) { + ret -= left; + /* try again if it submitted nothing and can't allocate a req */ + if (!ret && io_req_cache_empty(ctx)) + ret = -EAGAIN; + current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); - - return submitted; + return ret; } static inline bool io_sqd_events_pending(struct io_sq_data *sqd) @@ -7891,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ - /* Tell userspace we may need a wakeup call */ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { unsigned int to_submit; @@ -8023,8 +9217,8 @@ static int io_sq_thread(void *data) bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - io_ring_set_wakeup_flag(ctx); - + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; @@ -8035,7 +9229,7 @@ static int io_sq_thread(void *data) * Ensure the store of the wakeup flag is not * reordered with the load of the SQ tail */ - smp_mb(); + smp_mb__after_atomic(); if (io_sqring_entries(ctx)) { needs_sched = false; @@ -8049,7 +9243,8 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_clear_wakeup_flag(ctx); + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); } finish_wait(&sqd->wait, &wait); @@ -8059,7 +9254,7 @@ static int io_sq_thread(void *data) io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); @@ -8099,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) + if (io_should_wake(iowq) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -8121,15 +9317,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ktime_t timeout) { int ret; + unsigned long check_cq; /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(); if (ret || io_should_wake(iowq)) return ret; + check_cq = READ_ONCE(ctx->check_cq); /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->check_cq_overflow)) + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) return 1; - + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; return 1; @@ -8194,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; @@ -8435,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL_ACCOUNT); - return !!table->files; + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; } static void io_free_file_tables(struct io_file_table *table) { kvfree(table->files); + bitmap_free(table->bitmap); table->files = NULL; + table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + if (bit == table->alloc_hint) + table->alloc_hint++; +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(ctx, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + #if defined(CONFIG_UNIX) if (ctx->ring_sock) { struct sock *sock = ctx->ring_sock->sk; @@ -8454,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) kfree_skb(skb); } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } #endif io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); @@ -8608,107 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, return sqd; } -#if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) { +#if defined(CONFIG_UNIX) struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; struct scm_fp_list *fpl; struct sk_buff *skb; - int i, nr_files; - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } + if (likely(!io_file_need_scm(file))) + return 0; - skb->sk = sk; + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); - nr_files = 0; - fpl->user = get_uid(current_user()); - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; - if (!file) - continue; - fpl->fp[nr_files] = get_file(file); - unix_inflight(fpl->user, fpl->fp[nr_files]); - nr_files++; - } + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } - if (nr_files) { + fpl->user = get_uid(current_user()); fpl->max = SCM_MAX_FD; - fpl->count = nr_files; + fpl->count = 0; + UNIXCB(skb).fp = fpl; + skb->sk = sk; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); - - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); - - if (file) - fput(file); - } - } else { - kfree_skb(skb); - free_uid(fpl->user); - kfree(fpl); - } - - return 0; -} - -/* - * If UNIX sockets are enabled, fd passing can cause a reference cycle which - * causes regular reference counting to break down. We rely on the UNIX - * garbage collection to take care of this problem for us. - */ -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ - unsigned left, total; - int ret = 0; - - total = 0; - left = ctx->nr_user_files; - while (left) { - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - - ret = __io_sqe_files_scm(ctx, this_files, total); - if (ret) - break; - left -= this_files; - total += this_files; - } - - if (!ret) - return 0; - - while (total < ctx->nr_user_files) { - struct file *file = io_file_from_index(ctx, total); - - if (file) - fput(file); - total++; } - return ret; -} -#else -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif return 0; } -#endif static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { @@ -8719,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) struct sk_buff *skb; int i; + if (!io_file_need_scm(file)) { + fput(file); + return; + } + __skb_queue_head_init(&list); /* @@ -8783,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); - io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_ring_submit_unlock(ctx, lock_ring); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); } rsrc_data->do_put(ctx, prsrc); @@ -8845,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - ret = -ENOMEM; - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) - goto out_free; + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; - goto out_fput; + goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto out_fput; + goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) - goto out_fput; + goto fail; /* * Don't allow io_uring instances to be registered. If UNIX @@ -8876,74 +10075,23 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - goto out_fput; + goto fail; } - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); - } - - ret = io_sqe_files_scm(ctx); - if (ret) { - __io_sqe_files_unregister(ctx); - return ret; - } - - io_rsrc_node_switch(ctx, NULL); - return ret; -out_fput: - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) + ret = io_scm_file_account(ctx, file); + if (ret) { fput(file); - } - io_free_file_tables(&ctx->file_table); - ctx->nr_user_files = 0; -out_free: - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return ret; -} - -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, - int index) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head *head = &sock->sk_receive_queue; - struct sk_buff *skb; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb) { - struct scm_fp_list *fpl = UNIXCB(skb).fp; - - if (fpl->count < SCM_MAX_FD) { - __skb_unlink(skb, head); - spin_unlock_irq(&head->lock); - fpl->fp[fpl->count] = get_file(file); - unix_inflight(fpl->user, fpl->fp[fpl->count]); - fpl->count++; - spin_lock_irq(&head->lock); - __skb_queue_head(head, skb); - } else { - skb = NULL; + goto fail; } - } - spin_unlock_irq(&head->lock); - - if (skb) { - fput(file); - return 0; + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } - return __io_sqe_files_scm(ctx, 1, index); -#else + io_rsrc_node_switch(ctx, NULL); return 0; -#endif +fail: + __io_sqe_files_unregister(ctx); + return ret; } static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, @@ -8967,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8998,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (ret) goto err; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); needs_switch = true; } - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - ret = io_sqe_file_register(ctx, file, slot_index); - if (ret) { - file_slot->file_ptr = 0; - goto err; + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); } - - ret = 0; err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; @@ -9023,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -9051,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) goto out; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return ret; } @@ -9100,6 +10245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); needs_switch = true; } if (fd != -1) { @@ -9121,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - err = io_sqe_file_register(ctx, file, i); + err = io_scm_file_account(ctx, file); if (err) { - file_slot->file_ptr = 0; fput(file); break; } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } } @@ -9208,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -9386,8 +10532,8 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp, get_order(size)); } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, - size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -9395,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -9556,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, + int *npages) { - struct io_mapped_ubuf *imu = NULL; + unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - unsigned long off, start, end, ubuf; - size_t size; - int ret, pret, nr_pages, i; + int i, pret, ret = -ENOMEM; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - ubuf = (unsigned long) iov->iov_base; - end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - *pimu = NULL; - ret = -ENOMEM; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto done; @@ -9589,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -9610,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, break; } } + *npages = nr_pages; } else { ret = pret < 0 ? pret : -EFAULT; } @@ -9623,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unpin_user_pages(pages, pret); goto done; } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } - ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { - unpin_user_pages(pages, pret); + unpin_user_pages(pages, nr_pages); goto done; } - off = ubuf & ~PAGE_MASK; + off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; for (i = 0; i < nr_pages; i++) { size_t vec_len; @@ -9643,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } /* store original address for later verification */ - imu->ubuf = ubuf; - imu->ubuf_end = ubuf + iov->iov_len; + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; @@ -9652,7 +10826,6 @@ done: if (ret) kvfree(imu); kvfree(pages); - kvfree(vmas); return ret; } @@ -9711,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; @@ -9855,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; + unsigned long index; int i; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { - struct list_head *list = &ctx->io_buffers[i]; - - while (!list_empty(list)) { - struct io_buffer_list *bl; + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } - bl = list_first_entry(list, struct io_buffer_list, list); - __io_remove_buffers(ctx, bl, -1U); - list_del(&bl->list); - kfree(bl); - } + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -9887,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, state); - while (state->free_list.next) { + while (!io_req_cache_empty(ctx)) { struct io_wq_work_node *node; struct io_kiocb *req; @@ -9976,7 +11154,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -10007,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. */ - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) + if (io_cqring_events(ctx) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -10139,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->timeout_lock); - if (canceled != 0) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); @@ -10160,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); + /* failed during ring init, it couldn't have issued any requests */ + if (ctx->rings) { + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); + } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* @@ -10256,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return; + while (1) { enum io_wq_cancel cret; bool ret = false; @@ -10701,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } +static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +{ + if (flags & IORING_ENTER_EXT_ARG) { + struct io_uring_getevents_arg arg; + + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + } + return 0; +} + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) @@ -10738,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - int submitted = 0; struct fd f; long ret; @@ -10801,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ret) goto out; } - submitted = to_submit; + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; - mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - if (submitted != to_submit) + mutex_lock(&ctx->uring_lock); + ret = io_submit_sqes(ctx, to_submit); + if (ret != to_submit) { + mutex_unlock(&ctx->uring_lock); goto out; + } + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) + goto iopoll_locked; + mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (unlikely(ret)) - goto out; + int ret2; + if (ctx->syscall_iopoll) { + /* + * We disallow the app entering submit/complete with + * polling, but we still need to lock the ring to + * prevent racing with polled issue that got punted to + * a workqueue. + */ + mutex_lock(&ctx->uring_lock); +iopoll_locked: + ret2 = io_validate_ext_arg(flags, argp, argsz); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_iopoll_check(ctx, min_complete); + } + mutex_unlock(&ctx->uring_lock); + } else { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_cqring_wait(ctx, min_complete, sig, + argsz, ts); + } + } - min_complete = min(min_complete, ctx->cq_entries); + if (!ret) { + ret = ret2; - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. - */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, min_complete); - } else { - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); + /* + * EBADR indicates that one or more CQE were dropped. + * Once the user has been informed we can clear the bit + * as they are obviously ok with those drops. + */ + if (unlikely(ret2 == -EBADR)) + clear_bit(IO_CHECK_CQ_DROPPED_BIT, + &ctx->check_cq); } } @@ -10842,7 +12064,7 @@ out: out_fput: if (!(flags & IORING_ENTER_REGISTERED_RING)) fdput(f); - return submitted ? submitted : ret; + return ret; } #ifdef CONFIG_PROC_FS @@ -10889,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int cq_shift = 0; unsigned int sq_entries, cq_entries; bool has_lock; + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); unsigned int i; + if (is_cqe32) + cq_shift = 1; + /* * we may get imprecise sqe and cqe info if uring is actively running * since we get cached_sq_head and cached_cq_tail without uring_lock @@ -10925,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } } /* @@ -11032,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -11047,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; @@ -11159,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ctx->compat = in_compat_syscall(); if (!capable(CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. + */ + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) + goto err; + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; + ctx->notify_method = TWA_SIGNAL; + } + + /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount @@ -11260,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -11476,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -11618,6 +12893,85 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -11643,6 +12997,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: @@ -11652,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: @@ -11746,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -11822,6 +13194,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); + BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); @@ -11830,6 +13203,10 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); @@ -11839,6 +13216,10 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/xattr.c b/fs/xattr.c index 998045165916..e8dd03e4561e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -25,6 +25,8 @@ #include <linux/uaccess.h> +#include "internal.h" + static const char * strcmp_prefix(const char *a, const char *a_prefix) { @@ -539,44 +541,76 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ -static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, const void __user *value, size_t size, - int flags) + +int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) { int error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; + error = strncpy_from_user(ctx->kname->name, name, + sizeof(ctx->kname->name)); + if (error == 0 || error == sizeof(ctx->kname->name)) + return -ERANGE; if (error < 0) return error; - if (size) { - if (size > XATTR_SIZE_MAX) + error = 0; + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kvmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - error = -EFAULT; - goto out; + + ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); + if (IS_ERR(ctx->kvalue)) { + error = PTR_ERR(ctx->kvalue); + ctx->kvalue = NULL; } - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - kvalue, size); } - error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); -out: - kvfree(kvalue); + return error; +} + +static void setxattr_convert(struct user_namespace *mnt_userns, + struct dentry *d, struct xattr_ctx *ctx) +{ + if (ctx->size && + ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), + ctx->kvalue, ctx->size); +} + +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx) +{ + setxattr_convert(mnt_userns, dentry, ctx); + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + ctx->kvalue, ctx->size, ctx->flags); +} + +static long +setxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, const void __user *value, size_t size, + int flags) +{ + struct xattr_name kname; + struct xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + int error; + error = setxattr_copy(name, &ctx); + if (error) + return error; + + error = do_setxattr(mnt_userns, d, &ctx); + + kvfree(ctx.kvalue); return error; } @@ -642,44 +676,61 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, /* * Extended attribute GET operations */ -static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, void __user *value, size_t size) +ssize_t +do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, + struct xattr_ctx *ctx) { ssize_t error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; + char *kname = ctx->kname->name; - if (size) { - if (size > XATTR_SIZE_MAX) - size = XATTR_SIZE_MAX; - kvalue = kvzalloc(size, GFP_KERNEL); - if (!kvalue) + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) + ctx->size = XATTR_SIZE_MAX; + ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); + if (!ctx->kvalue) return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - kvalue, error); - if (size && copy_to_user(value, kvalue, error)) + ctx->kvalue, error); + if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; - } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - kvfree(kvalue); + return error; +} + +static ssize_t +getxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, void __user *value, size_t size) +{ + ssize_t error; + struct xattr_name kname; + struct xattr_ctx ctx = { + .value = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = 0, + }; + + error = strncpy_from_user(kname.name, name, sizeof(kname.name)); + if (error == 0 || error == sizeof(kname.name)) + error = -ERANGE; + if (error < 0) + return error; + + error = do_getxattr(mnt_userns, d, &ctx); + kvfree(ctx.kvalue); return error; } diff --git a/include/linux/audit.h b/include/linux/audit.h index d06134ac6245..cece70231138 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -339,7 +339,7 @@ static inline void audit_uring_entry(u8 op) } static inline void audit_uring_exit(int success, long code) { - if (unlikely(!audit_dummy_context())) + if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3431011f364d..cba8a6ffc329 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -287,6 +287,9 @@ struct ceph_osd_linger_request { rados_watcherrcb_t errcb; void *data; + struct ceph_pagelist *request_pl; + struct page **notify_id_pages; + struct page ***preply_pages; size_t *preply_len; }; diff --git a/include/linux/efi.h b/include/linux/efi.h index ccd4d3f91c98..0412304ce34e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -213,6 +213,8 @@ struct capsule_info { size_t page_bytes_remain; }; +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes); int __efi_capsule_setup_info(struct capsule_info *cap_info); /* @@ -383,6 +385,7 @@ void efi_native_runtime_setup(void); #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d) #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) +#define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9) #define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f) #define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23) @@ -405,6 +408,20 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) +#define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) + +#define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) + +/* + * This GUID may be installed onto the kernel image's handle as a NULL protocol + * to signal to the stub that the placement of the image should be respected, + * and moving the image in physical memory is undesirable. To ensure + * compatibility with 64k pages kernels with virtually mapped stacks, and to + * avoid defeating physical randomization, this protocol should only be + * installed if the image was placed at a randomized 128k aligned address in + * memory. + */ +#define LINUX_EFI_LOADED_IMAGE_FIXED_GUID EFI_GUID(0xf5a37b6d, 0x3344, 0x42a5, 0xb6, 0xbb, 0x97, 0x86, 0x48, 0xc1, 0x89, 0x0a) /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) @@ -435,6 +452,7 @@ typedef struct { } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL) #define EFI_2_30_SYSTEM_TABLE_REVISION ((2 << 16) | (30)) #define EFI_2_20_SYSTEM_TABLE_REVISION ((2 << 16) | (20)) @@ -596,6 +614,7 @@ extern struct efi { unsigned long tpm_log; /* TPM2 Event Log table */ unsigned long tpm_final_log; /* TPM2 Final Events Log table */ unsigned long mokvar_table; /* MOK variable config table */ + unsigned long coco_secret; /* Confidential computing secret table */ efi_get_time_t *get_time; efi_set_time_t *set_time; @@ -1335,4 +1354,12 @@ extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } #endif +struct linux_efi_coco_secret_area { + u64 base_pa; + u64 size; +}; + +/* Header of a populated EFI secret area */ +#define EFI_SECRET_TABLE_HEADER_GUID EFI_GUID(0x1e74f542, 0x71dd, 0x4d66, 0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b) + #endif /* _LINUX_EFI_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..87b5af1d9fbe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1953,6 +1953,7 @@ struct dir_context { #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; +struct io_uring_cmd; struct file_operations { struct module *owner; @@ -1995,6 +1996,7 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); } __randomize_layout; struct inode_operations { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1814e698d861..4a2f6cc5a492 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,11 +5,37 @@ #include <linux/sched.h> #include <linux/xarray.h> +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = 4, + IO_URING_F_CQE32 = 8, + IO_URING_F_IOPOLL = 16, +}; + +struct io_uring_cmd { + struct file *file; + const void *cmd; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + u32 cmd_op; + u32 pad; + u8 pdu[32]; /* available inline for free use */ +}; + #if defined(CONFIG_IO_URING) +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); +const char *io_uring_get_opcode(u8 opcode); static inline void io_uring_files_cancel(void) { @@ -29,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; @@ -42,6 +76,10 @@ static inline void io_uring_files_cancel(void) static inline void io_uring_free(struct task_struct *tsk) { } +static inline const char *io_uring_get_opcode(u8 opcode) +{ + return ""; +} #endif #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1fbe21650bb..f736c020cde2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -900,7 +900,7 @@ struct net_device_path_stack { struct net_device_path_ctx { const struct net_device *dev; - const u8 *daddr; + u8 daddr[ETH_ALEN]; int num_vlans; struct { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e7c39c200e2b..1a32036c918c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -196,6 +196,7 @@ void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +#define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu diff --git a/include/linux/sched.h b/include/linux/sched.h index a8911b1f35aa..b3278f8184d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2118,6 +2118,47 @@ static inline void cond_resched_rcu(void) #endif } +#ifdef CONFIG_PREEMPT_DYNAMIC + +extern bool preempt_model_none(void); +extern bool preempt_model_voluntary(void); +extern bool preempt_model_full(void); + +#else + +static inline bool preempt_model_none(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_NONE); +} +static inline bool preempt_model_voluntary(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); +} +static inline bool preempt_model_full(void) +{ + return IS_ENABLED(CONFIG_PREEMPT); +} + +#endif + +static inline bool preempt_model_rt(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_RT); +} + +/* + * Does the preemption model allow non-cooperative preemption? + * + * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with + * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the + * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the + * PREEMPT_NONE model. + */ +static inline bool preempt_model_preemptible(void) +{ + return preempt_model_full() || preempt_model_rt(); +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..66b689f6cfcb 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -356,13 +356,22 @@ static inline void clear_notify_signal(void) } /* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + +/* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { - if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && - !wake_up_state(task, TASK_INTERRUPTIBLE)) + if (__set_notify_signal(task)) kick_process(task); } diff --git a/include/linux/socket.h b/include/linux/socket.h index 6f85f5d957ef..17311ad9f9af 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,6 +50,9 @@ struct linger { struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ + + int msg_inq; /* output, data left in socket */ + struct iov_iter msg_iter; /* data */ /* @@ -62,8 +65,9 @@ struct msghdr { void __user *msg_control_user; }; bool msg_control_is_user : 1; - __kernel_size_t msg_controllen; /* ancillary data buffer length */ + bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ + __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ }; @@ -434,6 +438,7 @@ extern struct file *do_accept(struct file *file, unsigned file_flags, extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); +extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index cb1f4351e8ba..e3014319d1ad 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,11 +47,9 @@ struct srcu_data { */ struct srcu_node { spinlock_t __private lock; - unsigned long srcu_have_cbs[4]; /* GP seq for children */ - /* having CBs, but only */ - /* is > ->srcu_gq_seq. */ - unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ - /* have CBs for given GP? */ + unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ + /* if greater than ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ @@ -62,18 +60,24 @@ struct srcu_node { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ + int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters */ + spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ + unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ + unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; @@ -81,10 +85,23 @@ struct srcu_struct { atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ + unsigned long reschedule_jiffies; + unsigned long reschedule_count; struct delayed_work work; struct lockdep_map dep_map; }; +/* Values for size state variable (->srcu_size_state). */ +#define SRCU_SIZE_SMALL 0 +#define SRCU_SIZE_ALLOC 1 +#define SRCU_SIZE_WAIT_BARRIER 2 +#define SRCU_SIZE_WAIT_CALL 3 +#define SRCU_SIZE_WAIT_CBS1 4 +#define SRCU_SIZE_WAIT_CBS2 5 +#define SRCU_SIZE_WAIT_CBS3 6 +#define SRCU_SIZE_WAIT_CBS4 7 +#define SRCU_SIZE_BIG 8 + /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 @@ -121,6 +138,7 @@ struct srcu_struct { #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ is_static struct srcu_struct name; \ + extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 897494b597ba..795ef5a68429 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -17,6 +17,7 @@ enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, + TWA_SIGNAL_NO_IPI, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/include/linux/torture.h b/include/linux/torture.h index 63fa4196e51c..7038104463e4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -118,7 +118,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); _torture_stop_kthread("Stopping " #n " task", &(tp)) #ifdef CONFIG_PREEMPTION -#define torture_preempt_schedule() preempt_schedule() +#define torture_preempt_schedule() __preempt_schedule() #else #define torture_preempt_schedule() do { } while (0) #endif diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 463ae5d33eb0..5b47545f22d3 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -71,7 +71,6 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; - u32 tw_bslot; /* bind bucket slot */ struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; @@ -110,6 +109,8 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); + static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { diff --git a/include/net/ip.h b/include/net/ip.h index 3984f2c39c4b..0161137914cf 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -56,6 +56,7 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) +#define IPSKB_NOPOLICY BIT(8) u16 frag_max_size; }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fb899ff5afc..d2efddce65d4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1093,6 +1093,18 @@ static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, return false; } +static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, + int dir, unsigned short family) +{ + if (dir != XFRM_POLICY_OUT && family == AF_INET) { + /* same dst may be used for traffic originating from + * devices with different policy settings. + */ + return IPCB(skb)->flags & IPSKB_NOPOLICY; + } + return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) @@ -1104,7 +1116,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, return __xfrm_policy_check(sk, ndir, skb, family); return __xfrm_check_nopolicy(net, skb, dir) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); } diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index cddf5b6fbeb4..66fcc5a1a5b1 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,7 @@ #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> +#include <linux/io_uring.h> struct io_wq_work; @@ -147,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode, unsigned int flags, struct io_wq_work *work, int rw), - TP_ARGS(ctx, req, user_data, flags, opcode, work, rw), + TP_ARGS(ctx, req, user_data, opcode, flags, work, rw), TP_STRUCT__entry ( __field( void *, ctx ) @@ -169,8 +170,9 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->rw = rw; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); @@ -205,8 +207,9 @@ TRACE_EVENT(io_uring_defer, __entry->opcode = opcode; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d", - __entry->ctx, __entry->req, __entry->data, __entry->opcode) + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", + __entry->ctx, __entry->req, __entry->data, + io_uring_get_opcode(__entry->opcode)) ); /** @@ -305,9 +308,9 @@ TRACE_EVENT(io_uring_fail_link, __entry->link = link; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, - __entry->link) + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->link) ); /** @@ -318,13 +321,16 @@ TRACE_EVENT(io_uring_fail_link, * @user_data: user data associated with the request * @res: result of the request * @cflags: completion flags + * @extra1: extra 64-bit data for CQE32 + * @extra2: extra 64-bit data for CQE32 * */ TRACE_EVENT(io_uring_complete, - TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags), + TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags, + u64 extra1, u64 extra2), - TP_ARGS(ctx, req, user_data, res, cflags), + TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2), TP_STRUCT__entry ( __field( void *, ctx ) @@ -332,6 +338,8 @@ TRACE_EVENT(io_uring_complete, __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) + __field( u64, extra1 ) + __field( u64, extra2 ) ), TP_fast_assign( @@ -340,12 +348,17 @@ TRACE_EVENT(io_uring_complete, __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; + __entry->extra1 = extra1; + __entry->extra2 = extra2; ), - TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x", + TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " + "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, - __entry->res, __entry->cflags) + __entry->res, __entry->cflags, + (unsigned long long) __entry->extra1, + (unsigned long long) __entry->extra2) ); /** @@ -389,9 +402,9 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " "non block %d, sq_thread %d", __entry->ctx, __entry->req, - __entry->user_data, __entry->opcode, + __entry->user_data, io_uring_get_opcode(__entry->opcode), __entry->flags, __entry->force_nonblock, __entry->sq_thread) ); @@ -433,8 +446,9 @@ TRACE_EVENT(io_uring_poll_arm, __entry->events = events; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->mask, __entry->events) ); @@ -470,8 +484,9 @@ TRACE_EVENT(io_uring_task_add, __entry->mask = mask; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->mask) ); @@ -506,7 +521,7 @@ TRACE_EVENT(io_uring_req_failed, __field( u16, personality ) __field( u32, file_index ) __field( u64, pad1 ) - __field( u64, pad2 ) + __field( u64, addr3 ) __field( int, error ) ), @@ -520,27 +535,69 @@ TRACE_EVENT(io_uring_req_failed, __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; - __entry->op_flags = sqe->rw_flags; + __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; __entry->pad1 = sqe->__pad2[0]; - __entry->pad2 = sqe->__pad2[1]; + __entry->addr3 = sqe->addr3; __entry->error = error; ), TP_printk("ring %p, req %p, user_data 0x%llx, " - "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " + "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " - "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", + "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " + "error=%d", __entry->ctx, __entry->req, __entry->user_data, - __entry->opcode, __entry->flags, __entry->ioprio, + io_uring_get_opcode(__entry->opcode), + __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, - (unsigned long long) __entry->pad2, __entry->error) + (unsigned long long) __entry->addr3, __entry->error) +); + + +/* + * io_uring_cqe_overflow - a CQE overflowed + * + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request + * @res: CQE result + * @cflags: CQE flags + * @ocqe: pointer to the overflow cqe (if available) + * + */ +TRACE_EVENT(io_uring_cqe_overflow, + + TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, + void *ocqe), + + TP_ARGS(ctx, user_data, res, cflags, ocqe), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( unsigned long long, user_data ) + __field( s32, res ) + __field( u32, cflags ) + __field( void *, ocqe ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->user_data = user_data; + __entry->res = res; + __entry->cflags = cflags; + __entry->ocqe = ocqe; + ), + + TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " + "overflow_cqe %p", + __entry->ctx, __entry->user_data, __entry->res, + __entry->cflags, __entry->ocqe) ); #endif /* _TRACE_IO_URING_H */ diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index 8e4a2ca0bcbf..b1523cb8ab30 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -92,7 +92,7 @@ struct dma_buf_sync { * between them in actual uapi, they're just different numbers. */ #define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) -#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, u32) -#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, u64) +#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) +#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) #endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1845cf7c80ba..53e7dae92e42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -22,6 +22,7 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; + __u32 cmd_op; }; union { __u64 addr; /* pointer to buffer or iovecs */ @@ -45,6 +46,7 @@ struct io_uring_sqe { __u32 rename_flags; __u32 unlink_flags; __u32 hardlink_flags; + __u32 xattr_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -60,9 +62,28 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 __pad2[2]; + union { + struct { + __u64 addr3; + __u64 __pad2[1]; + }; + /* + * If the ring is initialized with IORING_SETUP_SQE128, then + * this field is used for 80 bytes of arbitrary command data + */ + __u8 cmd[0]; + }; }; +/* + * If sqe->file_index is set to this for opcodes that instantiate a new + * direct descriptor (like openat/openat2/accept), then io_uring will allocate + * an available direct descriptor instead of having the application pass one + * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE + * if the space is full. + */ +#define IORING_FILE_INDEX_ALLOC (~0U) + enum { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, @@ -102,8 +123,25 @@ enum { #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. + */ +#define IORING_SETUP_COOP_TASKRUN (1U << 8) +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + */ +#define IORING_SETUP_TASKRUN_FLAG (1U << 9) -enum { +#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ +#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ + +enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, IORING_OP_WRITEV, @@ -145,6 +183,12 @@ enum { IORING_OP_SYMLINKAT, IORING_OP_LINKAT, IORING_OP_MSG_RING, + IORING_OP_FSETXATTR, + IORING_OP_SETXATTR, + IORING_OP_FGETXATTR, + IORING_OP_GETXATTR, + IORING_OP_SOCKET, + IORING_OP_URING_CMD, /* this goes last, obviously */ IORING_OP_LAST, @@ -188,12 +232,45 @@ enum { #define IORING_POLL_UPDATE_USER_DATA (1U << 2) /* + * ASYNC_CANCEL flags. + * + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the + * request 'user_data' + * IORING_ASYNC_CANCEL_ANY Match any request + */ +#define IORING_ASYNC_CANCEL_ALL (1U << 0) +#define IORING_ASYNC_CANCEL_FD (1U << 1) +#define IORING_ASYNC_CANCEL_ANY (1U << 2) + +/* + * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send + * or receive and arm poll if that yields an + * -EAGAIN result, arm poll upfront and skip + * the initial transfer attempt. + */ +#define IORING_RECVSEND_POLL_FIRST (1U << 0) + +/* + * accept flags stored in sqe->ioprio + */ +#define IORING_ACCEPT_MULTISHOT (1U << 0) + +/* * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { __u64 user_data; /* sqe->data submission passed back */ __s32 res; /* result code for this event */ __u32 flags; + + /* + * If the ring is initialized with IORING_SETUP_CQE32, then this field + * contains 16-bytes of padding, doubling the size of the CQE. + */ + __u64 big_cqe[]; }; /* @@ -201,9 +278,11 @@ struct io_uring_cqe { * * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) +#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) enum { IORING_CQE_BUFFER_SHIFT = 16, @@ -236,6 +315,7 @@ struct io_sqring_offsets { */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ +#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */ struct io_cqring_offsets { __u32 head; @@ -333,6 +413,10 @@ enum { IORING_REGISTER_RING_FDS = 20, IORING_UNREGISTER_RING_FDS = 21, + /* register ring based provide buffer group */ + IORING_REGISTER_PBUF_RING = 22, + IORING_UNREGISTER_PBUF_RING = 23, + /* this goes last */ IORING_REGISTER_LAST }; @@ -350,9 +434,15 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +/* + * Register a fully sparse file space, rather than pass in an array of all + * -1 file descriptors. + */ +#define IORING_RSRC_REGISTER_SPARSE (1U << 0) + struct io_uring_rsrc_register { __u32 nr; - __u32 resv; + __u32 flags; __u64 resv2; __aligned_u64 data; __aligned_u64 tags; @@ -404,6 +494,38 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_buf { + __u64 addr; + __u32 len; + __u16 bid; + __u16 resv; +}; + +struct io_uring_buf_ring { + union { + /* + * To avoid spilling into more pages than we need to, the + * ring tail is overlaid with the io_uring_buf->resv field. + */ + struct { + __u64 resv1; + __u32 resv2; + __u16 resv3; + __u16 tail; + }; + struct io_uring_buf bufs[0]; + }; +}; + +/* argument for IORING_(UN)REGISTER_PBUF_RING */ +struct io_uring_buf_reg { + __u64 ring_addr; + __u32 ring_entries; + __u16 bgid; + __u16 pad; + __u64 resv[3]; +}; + /* * io_uring_restriction->opcode values */ diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index b2e43185e3b5..2f76cba67166 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 { __u64 result; }; +/* same as struct nvme_passthru_cmd64, minus the 8b result field */ +struct nvme_uring_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 rsvd2; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -83,4 +105,10 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) +/* io_uring async commands: */ +#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) +#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) +#define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) +#define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) + #endif /* _UAPI_LINUX_NVME_IOCTL_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ea2ee1181921..f3a2abd6d1a1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1959,6 +1959,12 @@ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); + if (ctx->dummy) { + if (ctx->context != AUDIT_CTX_URING) + return; + goto out; + } + if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/events/core.c b/kernel/events/core.c index 7858bafffa9d..7f1e4c5897e7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -12217,6 +12217,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -12282,6 +12285,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } } @@ -12298,7 +12302,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. + */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..1c630e573548 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -77,31 +77,56 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + config TASKS_RCU - def_bool PREEMPTION + bool + default n + select IRQ_WORK + +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. config TASKS_RUDE_RCU - def_bool 0 + bool + default n + select IRQ_WORK + +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. config TASKS_TRACE_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -195,6 +220,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU @@ -225,7 +264,7 @@ config RCU_NOCB_CPU config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..9b64e55d4f61 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,9 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -47,9 +44,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests @@ -66,9 +60,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests @@ -91,6 +82,20 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 20 if ANDROID + default 0 if !ANDROID + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..152492d52715 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { @@ -523,6 +525,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -534,14 +538,19 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..277a5bfb37d4 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,16 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -324,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -797,7 +817,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..7120165a9342 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -738,6 +738,50 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + +#ifdef CONFIG_TASKS_RCU + +/* * Definitions for RCU-tasks torture testing. */ @@ -780,47 +824,16 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ +#define TASKS_OPS &tasks_ops, -static void synchronize_rcu_trivial(void) -{ - int cpu; +#else // #ifdef CONFIG_TASKS_RCU - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} +#define TASKS_OPS -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} +#endif // #else #ifdef CONFIG_TASKS_RCU -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; +#ifdef CONFIG_TASKS_RUDE_RCU /* * Definitions for rude RCU-tasks torture testing. @@ -851,6 +864,17 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -893,6 +917,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1178,7 +1211,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1186,6 +1219,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { @@ -1322,6 +1356,17 @@ rcu_torture_fakewriter(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && @@ -2916,10 +2961,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -3016,6 +3063,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3096,9 +3144,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -3320,6 +3368,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..909644abee67 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,16 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -261,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -790,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..50ba70f019de 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/srcu.h> #include "rcu.h" @@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all. + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape (default). + * 0x1x: Convert when excessive contention encountered. + */ +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_AUTO; +module_param(convert_to_big, int, 0444); + +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); + init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + __srcu_transition_to_big(ssp); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + +/* + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. + */ +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +{ + unsigned long j; + + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + +/* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added @@ -343,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. +#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* * Return grace-period delay, zero if there are expedited grace @@ -351,10 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + unsigned long jbase = SRCU_INTERVAL; + + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) + jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -382,13 +561,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } + kfree(ssp->node); + ssp->node = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -434,9 +620,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -445,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -517,7 +709,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -526,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + cbdelay = !!srcu_get_delay(ssp); WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -537,38 +731,45 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -583,6 +784,14 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp, GFP_KERNEL); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -596,20 +805,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) + return; + spin_lock_irqsave_rcu_node(snp, flags); + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -630,39 +843,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + unsigned long sgsne; + struct srcu_node *snp; + struct srcu_node *snp_leaf; unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -678,9 +899,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -814,11 +1041,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, @@ -834,10 +1067,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } @@ -1097,6 +1337,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1104,7 +1366,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1120,27 +1382,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + idx = srcu_read_lock(ssp); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) @@ -1214,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1228,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1318,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -1337,43 +1603,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; + + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); + if (SRCU_SIZING_IS_TORTURE()) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); @@ -1390,6 +1682,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1400,6 +1703,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 99cf3a13954c..3925e32159b5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -46,7 +46,7 @@ struct rcu_tasks_percpu { /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. @@ -77,7 +77,7 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; @@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ .gp_func = gp, \ .call_func = call, \ @@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644); #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -261,14 +266,16 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -278,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; @@ -460,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -509,7 +517,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); if (needgpcb & 0x2) { // Wait for one grace period. @@ -548,8 +558,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -568,7 +585,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -576,7 +603,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -592,10 +619,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); rtp->pregp_func(); @@ -621,30 +653,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); @@ -950,6 +1002,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..c25ba442044a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1705,11 +1707,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } @@ -2096,14 +2124,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period). The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist. We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2609,6 +2652,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2624,6 +2674,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user) + rcu_tasks_classic_qs(current, false); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -3717,7 +3769,9 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPTION)) + // Invoking preempt_model_*() too early gets a splat. + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE || + preempt_model_full() || preempt_model_rt()) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); @@ -4179,6 +4233,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } @@ -4471,6 +4526,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4480,6 +4580,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4497,9 +4598,17 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4745,7 +4854,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4782,7 +4890,7 @@ static void __init kfree_rcu_batch_init(void) void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); @@ -4802,17 +4910,15 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..2ccf5845957d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include <linux/cache.h> +#include <linux/kthread.h> #include <linux/spinlock.h> #include <linux/rtmutex.h> #include <linux/threads.h> @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -254,6 +259,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; @@ -364,6 +370,7 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ @@ -421,7 +428,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -439,7 +445,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 60197ea24ceb..0f70f62039a9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -334,15 +334,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -417,13 +415,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -496,7 +593,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) @@ -571,7 +668,7 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; } } @@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 636d0546a4e9..46694e13398a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -215,14 +212,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) @@ -1180,10 +1169,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1241,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct task_struct *t; struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1277,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); - } -} - /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_gp_stride = -1; module_param(rcu_nocb_gp_stride, int, 0444); @@ -1549,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..c8ba0fe17267 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { @@ -660,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -1124,7 +1131,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1226,18 +1234,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1263,10 +1259,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..a001e1e7a992 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { @@ -565,9 +593,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -626,9 +654,9 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 180ff9c41fa8..fc7fef575606 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d58c0389eb23..2a05096559a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8415,6 +8415,18 @@ static void __init preempt_dynamic_init(void) } } +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } diff --git a/kernel/smp.c b/kernel/smp.c index 65a630f62363..df9393aeae28 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; diff --git a/kernel/task_work.c b/kernel/task_work.c index c59e1a49bc40..dff75bcde151 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..bf5da6c4e999 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -144,6 +144,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index af9302141bcf..e5c5315da274 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -76,6 +76,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, data = kzalloc(sizeof(*ref->data), gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); + ref->percpu_count_ptr = 0; return -ENOMEM; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 196417859c4a..68b3e850bcb9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -39,6 +39,13 @@ static int br_pass_frame_up(struct sk_buff *skb) dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); + + /* Reset the offload_fwd_mark because there could be a stacked + * bridge above, and it should not think this bridge it doing + * that bridge's work forwarding out its ports. + */ + br_switchdev_frame_unmark(skb); + /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 83eb97c94e83..9d82bb42e958 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -537,43 +537,6 @@ static void request_init(struct ceph_osd_request *req) target_init(&req->r_t); } -/* - * This is ugly, but it allows us to reuse linger registration and ping - * requests, keeping the structure of the code around send_linger{_ping}() - * reasonable. Setting up a min_nr=2 mempool for each linger request - * and dealing with copying ops (this blasts req only, watch op remains - * intact) isn't any better. - */ -static void request_reinit(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - bool mempool = req->r_mempool; - unsigned int num_ops = req->r_num_ops; - u64 snapid = req->r_snapid; - struct ceph_snap_context *snapc = req->r_snapc; - bool linger = req->r_linger; - struct ceph_msg *request_msg = req->r_request; - struct ceph_msg *reply_msg = req->r_reply; - - dout("%s req %p\n", __func__, req); - WARN_ON(kref_read(&req->r_kref) != 1); - request_release_checks(req); - - WARN_ON(kref_read(&request_msg->kref) != 1); - WARN_ON(kref_read(&reply_msg->kref) != 1); - target_destroy(&req->r_t); - - request_init(req); - req->r_osdc = osdc; - req->r_mempool = mempool; - req->r_num_ops = num_ops; - req->r_snapid = snapid; - req->r_snapc = snapc; - req->r_linger = linger; - req->r_request = request_msg; - req->r_reply = reply_msg; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -918,14 +881,30 @@ EXPORT_SYMBOL(osd_req_op_xattr_init); * @watch_opcode: CEPH_OSD_WATCH_OP_* */ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, - u64 cookie, u8 watch_opcode) + u8 watch_opcode, u64 cookie, u32 gen) { struct ceph_osd_req_op *op; op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; - op->watch.gen = 0; + op->watch.gen = gen; +} + +/* + * prot_ver, timeout and notify payload (may be empty) should already be + * encoded in @request_pl + */ +static void osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, struct ceph_pagelist *request_pl) +{ + struct ceph_osd_req_op *op; + + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; + + ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl); + op->indata_len = request_pl->length; } /* @@ -2731,10 +2710,13 @@ static void linger_release(struct kref *kref) WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); - if (lreq->reg_req) - ceph_osdc_put_request(lreq->reg_req); - if (lreq->ping_req) - ceph_osdc_put_request(lreq->ping_req); + if (lreq->request_pl) + ceph_pagelist_release(lreq->request_pl); + if (lreq->notify_id_pages) + ceph_release_page_vector(lreq->notify_id_pages, 1); + + ceph_osdc_put_request(lreq->reg_req); + ceph_osdc_put_request(lreq->ping_req); target_destroy(&lreq->t); kfree(lreq); } @@ -3003,6 +2985,12 @@ static void linger_commit_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); linger_reg_commit_complete(lreq, req->r_result); @@ -3026,6 +3014,7 @@ static void linger_commit_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3048,6 +3037,12 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->reg_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->reg_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->last_error); if (req->r_result < 0) { @@ -3057,46 +3052,64 @@ static void linger_reconnect_cb(struct ceph_osd_request *req) } } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } static void send_linger(struct ceph_osd_linger_request *lreq) { - struct ceph_osd_request *req = lreq->reg_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req; + int ret; - verify_osdc_wrlocked(req->r_osdc); + verify_osdc_wrlocked(osdc); + mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->reg_req) { + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + ceph_osdc_put_request(lreq->reg_req); + } + + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); - request_reinit(req); target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; - mutex_lock(&lreq->lock); if (lreq->is_watch && lreq->committed) { - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id); - op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; - op->watch.gen = ++lreq->register_gen; + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT, + lreq->linger_id, ++lreq->register_gen); dout("lreq %p reconnect register_gen %u\n", lreq, - op->watch.gen); + req->r_ops[0].watch.gen); req->r_callback = linger_reconnect_cb; } else { - if (!lreq->is_watch) + if (lreq->is_watch) { + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH, + lreq->linger_id, 0); + } else { lreq->notify_id = 0; - else - WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + + refcount_inc(&lreq->request_pl->refcnt); + osd_req_op_notify_init(req, 0, lreq->linger_id, + lreq->request_pl); + ceph_osd_data_pages_init( + osd_req_op_data(req, 0, notify, response_data), + lreq->notify_id_pages, PAGE_SIZE, 0, false, false); + } dout("lreq %p register\n", lreq); req->r_callback = linger_commit_cb; } - mutex_unlock(&lreq->lock); + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->reg_req = req; + mutex_unlock(&lreq->lock); submit_request(req, true); } @@ -3106,6 +3119,12 @@ static void linger_ping_cb(struct ceph_osd_request *req) struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); + if (req != lreq->ping_req) { + dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", + __func__, lreq, lreq->linger_id, req, lreq->ping_req); + goto out; + } + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, lreq->last_error); @@ -3121,6 +3140,7 @@ static void linger_ping_cb(struct ceph_osd_request *req) lreq->register_gen, req->r_ops[0].watch.gen); } +out: mutex_unlock(&lreq->lock); linger_put(lreq); } @@ -3128,8 +3148,8 @@ static void linger_ping_cb(struct ceph_osd_request *req) static void send_linger_ping(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; - struct ceph_osd_request *req = lreq->ping_req; - struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_osd_request *req; + int ret; if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); @@ -3141,19 +3161,26 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) __func__, lreq, lreq->linger_id, lreq->ping_sent, lreq->register_gen); - if (req->r_osd) - cancel_linger_request(req); + if (lreq->ping_req) { + if (lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + ceph_osdc_put_request(lreq->ping_req); + } - request_reinit(req); - target_copy(&req->r_t, &lreq->t); + req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO); + BUG_ON(!req); - WARN_ON(op->op != CEPH_OSD_OP_WATCH || - op->watch.cookie != lreq->linger_id || - op->watch.op != CEPH_OSD_WATCH_OP_PING); - op->watch.gen = lreq->register_gen; + target_copy(&req->r_t, &lreq->t); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id, + lreq->register_gen); req->r_callback = linger_ping_cb; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + BUG_ON(ret); + req->r_priv = linger_get(lreq); req->r_linger = true; + lreq->ping_req = req; ceph_osdc_get_request(req); account_request(req); @@ -3169,12 +3196,6 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) down_write(&osdc->lock); linger_register(lreq); - if (lreq->is_watch) { - lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; - lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; - } else { - lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; - } calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); @@ -3206,9 +3227,9 @@ static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - if (lreq->is_watch && lreq->ping_req->r_osd) + if (lreq->ping_req && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); - if (lreq->reg_req->r_osd) + if (lreq->reg_req && lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); @@ -4570,8 +4591,13 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, EXPORT_SYMBOL(ceph_osdc_start_request); /* - * Unregister a registered request. The request is not completed: - * ->r_result isn't set and __complete_request() isn't called. + * Unregister request. If @req was registered, it isn't completed: + * r_result isn't set and __complete_request() isn't invoked. + * + * If @req wasn't registered, this call may have raced with + * handle_reply(), in which case r_result would already be set and + * __complete_request() would be getting invoked, possibly even + * concurrently with this call. */ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { @@ -4657,43 +4683,6 @@ again: } EXPORT_SYMBOL(ceph_osdc_sync); -static struct ceph_osd_request * -alloc_linger_request(struct ceph_osd_linger_request *lreq) -{ - struct ceph_osd_request *req; - - req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); - if (!req) - return NULL; - - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - return req; -} - -static struct ceph_osd_request * -alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) -{ - struct ceph_osd_request *req; - - req = alloc_linger_request(lreq); - if (!req) - return NULL; - - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - osd_req_op_watch_init(req, 0, 0, watch_opcode); - - if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { - ceph_osdc_put_request(req); - return NULL; - } - - return req; -} - /* * Returns a handle, caller owns a ref. */ @@ -4723,18 +4712,6 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); - lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); - if (!lreq->reg_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - - lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); - if (!lreq->ping_req) { - ret = -ENOMEM; - goto err_put_lreq; - } - linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (ret) { @@ -4772,8 +4749,8 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&req->r_mtime); - osd_req_op_watch_init(req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_UNWATCH); + osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH, + lreq->linger_id, 0); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) @@ -4859,35 +4836,6 @@ out_put_req: } EXPORT_SYMBOL(ceph_osdc_notify_ack); -static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, - u64 cookie, u32 prot_ver, u32 timeout, - void *payload, u32 payload_len) -{ - struct ceph_osd_req_op *op; - struct ceph_pagelist *pl; - int ret; - - op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); - op->notify.cookie = cookie; - - pl = ceph_pagelist_alloc(GFP_NOIO); - if (!pl) - return -ENOMEM; - - ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ - ret |= ceph_pagelist_encode_32(pl, timeout); - ret |= ceph_pagelist_encode_32(pl, payload_len); - ret |= ceph_pagelist_append(pl, payload, payload_len); - if (ret) { - ceph_pagelist_release(pl); - return -ENOMEM; - } - - ceph_osd_data_pagelist_init(&op->notify.request_data, pl); - op->indata_len = pl->length; - return 0; -} - /* * @timeout: in seconds * @@ -4906,7 +4854,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, size_t *preply_len) { struct ceph_osd_linger_request *lreq; - struct page **pages; int ret; WARN_ON(!timeout); @@ -4919,41 +4866,35 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, if (!lreq) return -ENOMEM; - lreq->preply_pages = preply_pages; - lreq->preply_len = preply_len; - - ceph_oid_copy(&lreq->t.base_oid, oid); - ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_READ; - - lreq->reg_req = alloc_linger_request(lreq); - if (!lreq->reg_req) { + lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO); + if (!lreq->request_pl) { ret = -ENOMEM; goto out_put_lreq; } - /* - * Pass 0 for cookie because we don't know it yet, it will be - * filled in by linger_submit(). - */ - ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, - payload, payload_len); - if (ret) + ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout); + ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len); + ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len); + if (ret) { + ret = -ENOMEM; goto out_put_lreq; + } /* for notify_id */ - pages = ceph_alloc_page_vector(1, GFP_NOIO); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); + lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(lreq->notify_id_pages)) { + ret = PTR_ERR(lreq->notify_id_pages); + lreq->notify_id_pages = NULL; goto out_put_lreq; } - ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, - response_data), - pages, PAGE_SIZE, 0, false, true); - ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); - if (ret) - goto out_put_lreq; + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; linger_submit(lreq); ret = linger_reg_commit_wait(lreq); diff --git a/net/core/dev.c b/net/core/dev.c index 1461c2d9dec8..2771fd22dc6a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -681,11 +681,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, - .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ae662567a6cb..0ea29270d7e5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1030,9 +1030,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v4_ctl_sk); } +static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET); +} + static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, + .exit_batch = dccp_v4_exit_batch, .id = &dccp_v4_pernet_id, .size = sizeof(struct dccp_v4_pernet), }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index eab3bd1ee9a0..fa663518fa0e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1115,9 +1115,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } +static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET6); +} + static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, + .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 9e0bbd026560..0ec501845cb3 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -52,7 +52,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) spin_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); @@ -111,12 +112,8 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - /* Cache inet_bhashfn(), because 'struct net' might be no longer - * available later in inet_twsk_kill(). - */ - tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num, - hashinfo->bhash_size); - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); @@ -257,3 +254,50 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); + +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_nulls_node *node; + unsigned int slot; + + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + cond_resched(); + rcu_read_lock(); +restart: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (sk->sk_state != TCP_TIME_WAIT) + continue; + tw = inet_twsk(sk); + if ((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count)) + continue; + + if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + goto restart_rcu; + } + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 57abd27e842c..ed01063d8f30 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1726,6 +1726,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; struct rtable *rth; + bool no_policy; u32 itag = 0; int err; @@ -1736,8 +1737,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (our) flags |= RTCF_LOCAL; + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) return -ENOBUFS; @@ -1796,7 +1801,7 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - bool do_cache; + bool do_cache, no_policy; u32 itag = 0; /* get a working reference to the output device */ @@ -1841,6 +1846,10 @@ static int __mkroute_input(struct sk_buff *skb, } } + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) @@ -1853,8 +1862,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), + rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; @@ -2229,6 +2237,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; struct flowi4 fl4; bool do_cache = true; + bool no_policy; /* IP on this device is disabled. */ @@ -2347,6 +2356,10 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); @@ -2361,7 +2374,7 @@ local_input: rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) goto e_nobufs; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cf18fbcbf123..bb7ef45408e1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2335,8 +2335,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (sk->sk_state == TCP_LISTEN) goto out; - if (tp->recvmsg_inq) + if (tp->recvmsg_inq) { *cmsg_flags = TCP_CMSG_INQ; + msg->msg_get_inq = 1; + } timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -2559,7 +2561,7 @@ recv_sndq: int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { - int cmsg_flags = 0, ret, inq; + int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) @@ -2576,12 +2578,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, release_sock(sk); sk_defer_free_flush(sk); - if (cmsg_flags && ret >= 0) { + if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); - if (cmsg_flags & TCP_CMSG_INQ) { - inq = tcp_inq_hint(sk); - put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + if (msg->msg_get_inq) { + msg->msg_inq = tcp_inq_hint(sk); + if (cmsg_flags & TCP_CMSG_INQ) + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f9cec624068d..457f5b5d5d4a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3173,6 +3173,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + list_for_each_entry(net, net_exit_list, exit_list) tcp_fastopen_ctx_destroy(net); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 13678d3908fa..faaddaf43c90 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2207,9 +2207,15 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, AF_INET6); +} + static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, + .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) diff --git a/net/key/af_key.c b/net/key/af_key.c index fd51db3be91c..339d95df19d3 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2826,8 +2826,10 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, - BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); + err = pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); + if (err) + return err; memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); @@ -2898,7 +2900,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg)) + if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2916,7 +2918,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg))) + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { @@ -2927,7 +2929,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg)) + if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 325383646f5c..b548cec86c9d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -107,7 +107,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } @@ -221,7 +221,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } @@ -1240,7 +1240,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } -u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; @@ -1256,15 +1256,25 @@ u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); - return (__force u16)csum_fold(csum); + return csum_fold(csum); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } +static void put_len_csum(u16 len, __sum16 csum, void *data) +{ + __sum16 *sumptr = data + 2; + __be16 *ptr = data; + + put_unaligned_be16(len, ptr); + + put_unaligned(csum, sumptr); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { @@ -1340,8 +1350,9 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { - put_unaligned_be32(mpext->data_len << 16 | - mptcp_make_csum(mpext), ptr); + put_len_csum(mpext->data_len, + mptcp_make_csum(mpext), + ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); @@ -1392,11 +1403,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->data_len << 16 | - __mptcp_make_csum(opts->data_seq, - opts->subflow_seq, - opts->data_len, - ~csum_unfold(opts->csum)), ptr); + put_len_csum(opts->data_len, + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + ~csum_unfold(opts->csum)), + ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 01809eef29b4..aa51b100e033 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -178,14 +178,13 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; - update_subflows = (ssk->sk_state == TCP_CLOSE) && - (subflow->request_join || subflow->mp_join); + update_subflows = subflow->request_join || subflow->mp_join; if (!READ_ONCE(pm->work_pending) && !update_subflows) return; spin_lock_bh(&pm->lock); if (update_subflows) - pm->subflows--; + __mptcp_pm_close_subflow(msk); /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3c1a3036550f..5655a63aa6a8 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -443,7 +443,8 @@ struct mptcp_subflow_context { can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ - local_id_valid : 1; /* local_id is correctly initialized */ + local_id_valid : 1, /* local_id is correctly initialized */ + valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -723,7 +724,7 @@ void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); @@ -833,6 +834,20 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); +/* called under PM lock */ +static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + WRITE_ONCE(msk->pm.accept_subflow, true); +} + +static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + spin_lock_bh(&msk->pm.lock); + __mptcp_pm_close_subflow(msk); + spin_unlock_bh(&msk->pm.lock); +} + void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index aba260f547da..be76ada89d96 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -888,7 +888,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; - u16 csum; + __sum16 csum; int len; if (!csum_reqd) @@ -955,11 +955,14 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); + if (subflow->mp_join || subflow->valid_csum_seen) { + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); + } return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; } + subflow->valid_csum_seen = 1; return MAPPING_OK; } @@ -1141,6 +1144,18 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss } } +static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + if (subflow->mp_join) + return false; + else if (READ_ONCE(msk->csum_enabled)) + return !subflow->valid_csum_seen; + else + return !subflow->fully_established; +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1218,7 +1233,7 @@ fallback: return true; } - if (subflow->mp_join || subflow->fully_established) { + if (!subflow_can_fallback(subflow)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1422,20 +1437,20 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, struct sockaddr_storage addr; int remote_id = remote->id; int local_id = loc->id; + int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; int ifindex; u8 flags; - int err; if (!mptcp_is_fully_established(sk)) - return -ENOTCONN; + goto err_out; err = mptcp_subflow_create_socket(sk, &sf); if (err) - return err; + goto err_out; ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); @@ -1492,6 +1507,12 @@ failed_unlink: failed: subflow->disposable = 1; sock_release(sf); + +err_out: + /* we account subflows before the creation, and this failures will not + * be caught by sk_state_change() + */ + mptcp_pm_close_subflow(msk); return err; } diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 3db256da919b..f2def06d1070 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -179,12 +179,11 @@ EXPORT_SYMBOL_GPL(flow_offload_route_init); static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { - tcp->state = TCP_CONNTRACK_ESTABLISHED; tcp->seen[0].td_maxwin = 0; tcp->seen[1].td_maxwin = 0; } -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); @@ -193,7 +192,9 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); - timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + flow_offload_fixup_tcp(&ct->proto.tcp); + + timeout = tn->timeouts[ct->proto.tcp.state]; timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); @@ -211,18 +212,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); } -static void flow_offload_fixup_ct_state(struct nf_conn *ct) -{ - if (nf_ct_protonum(ct) == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); -} - -static void flow_offload_fixup_ct(struct nf_conn *ct) -{ - flow_offload_fixup_ct_state(ct); - flow_offload_fixup_ct_timeout(ct); -} - static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); @@ -335,8 +324,10 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); - if (READ_ONCE(flow->timeout) != timeout) + if (timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); + else + return; if (likely(!nf_flowtable_hw_offload(flow_table))) return; @@ -359,22 +350,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table, rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - - clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - - if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(flow->ct); - else - flow_offload_fixup_ct_timeout(flow->ct); - flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); set_bit(NF_FLOW_TEARDOWN, &flow->flags); - - flow_offload_fixup_ct_state(flow->ct); + flow_offload_fixup_ct(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -438,33 +421,12 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple) -{ - struct dst_entry *dst; - - if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || - tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { - dst = tuple->dst_cache; - if (!dst_check(dst, tuple->dst_cookie)) - return true; - } - - return false; -} - -static bool nf_flow_has_stale_dst(struct flow_offload *flow) -{ - return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) || - flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple); -} - static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || - nf_ct_is_dying(flow->ct) || - nf_flow_has_stale_dst(flow)) - set_bit(NF_FLOW_TEARDOWN, &flow->flags); + nf_ct_is_dying(flow->ct)) + flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { if (test_bit(NF_FLOW_HW, &flow->flags)) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 32c0eb1b4821..b350fe9d00b0 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -248,6 +248,15 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } +static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple) +{ + if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH && + tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM) + return true; + + return dst_check(tuple->dst_cache, tuple->dst_cookie); +} + static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, const struct nf_hook_state *state, struct dst_entry *dst) @@ -367,6 +376,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; + if (!nf_flow_dst_check(&tuplehash->tuple)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; @@ -624,6 +638,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return NF_ACCEPT; + if (!nf_flow_dst_check(&tuplehash->tuple)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 16c3a39689f4..a096b9fbbbdf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8342,16 +8342,7 @@ EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static bool nft_expr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { - if (!expr->ops->reduce) { - pr_warn_once("missing reduce for expression %s ", - expr->ops->type->name); - return false; - } - - if (nft_reduce_is_readonly(expr)) - return false; - - return expr->ops->reduce(track, expr); + return false; } static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 900d48c810a1..6f0b07fe648d 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -36,6 +36,15 @@ static void nft_default_forward_path(struct nf_flow_route *route, route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, @@ -47,6 +56,9 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct neighbour *n; u8 nud_state; + if (!nft_is_valid_ether_device(dev)) + goto out; + n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -1; @@ -60,6 +72,7 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, if (!(nud_state & NUD_VALID)) return -1; +out: return dev_fill_forward_path(dev, ha, stack); } @@ -78,15 +91,6 @@ struct nft_forward_info { enum flow_offload_xmit_type xmit_type; }; -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, unsigned char *ha, struct nf_flowtable *flowtable) @@ -119,7 +123,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, info->indev = NULL; break; } - info->outdev = path->dev; + if (!info->outdev) + info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; @@ -293,7 +298,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); - if (unlikely(!tcph || tcph->fin || tcph->rst)) + if (unlikely(!tcph || tcph->fin || tcph->rst || + !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 6055dc9a82aa..aa5e712adf07 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -118,7 +118,7 @@ static int nci_queue_tx_data_frags(struct nci_dev *ndev, skb_frag = nci_skb_alloc(ndev, (NCI_DATA_HDR_SIZE + frag_len), - GFP_KERNEL); + GFP_ATOMIC); if (skb_frag == NULL) { rc = -ENOMEM; goto free_exit; diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 19703a649b5a..78c4b6addf15 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -153,7 +153,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + - NCI_DATA_HDR_SIZE, GFP_KERNEL); + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; @@ -184,7 +184,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + - NCI_DATA_HDR_SIZE, GFP_KERNEL); + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 0eaaf1f45de1..211c757bfc3c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -232,6 +232,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, for (i = 0; i < p->tcfp_nkeys; ++i) { u32 cur = p->tcfp_keys[i].off; + /* sanitize the shift value for any later use */ + p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, + p->tcfp_keys[i].shift); + /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ diff --git a/net/socket.c b/net/socket.c index 6887840682bb..bb6a1a12fbde 100644 --- a/net/socket.c +++ b/net/socket.c @@ -504,7 +504,7 @@ static int sock_map_fd(struct socket *sock, int flags) struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) - return file->private_data; /* set in sock_map_fd */ + return file->private_data; /* set in sock_alloc_file */ return NULL; } @@ -1538,11 +1538,10 @@ int sock_create_kern(struct net *net, int family, int type, int protocol, struct } EXPORT_SYMBOL(sock_create_kern); -int __sys_socket(int family, int type, int protocol) +static struct socket *__sys_socket_create(int family, int type, int protocol) { - int retval; struct socket *sock; - int flags; + int retval; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); @@ -1550,17 +1549,50 @@ int __sys_socket(int family, int type, int protocol) BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); - flags = type & ~SOCK_TYPE_MASK; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; + if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + return ERR_PTR(retval); + + return sock; +} + +struct file *__sys_socket_file(int family, int type, int protocol) +{ + struct socket *sock; + struct file *file; + int flags; + + sock = __sys_socket_create(family, type, protocol); + if (IS_ERR(sock)) + return ERR_CAST(sock); + + flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - retval = sock_create(family, type, protocol, &sock); - if (retval < 0) - return retval; + file = sock_alloc_file(sock, flags, NULL); + if (IS_ERR(file)) + sock_release(sock); + + return file; +} + +int __sys_socket(int family, int type, int protocol) +{ + struct socket *sock; + int flags; + + sock = __sys_socket_create(family, type, protocol); + if (IS_ERR(sock)) + return PTR_ERR(sock); + + flags = type & ~SOCK_TYPE_MASK; + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 00bd0ecff5a1..f1876ea61fdc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3744,7 +3744,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { - dst->dev = dev_net(dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index 0ae4e4e57a40..3fb8f9026e9b 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -179,7 +179,8 @@ int hashtab_duplicate(struct hashtab *new, struct hashtab *orig, kmem_cache_free(hashtab_node_cachep, cur); } } - kmem_cache_free(hashtab_node_cachep, new); + kfree(new->htable); + memset(new, 0, sizeof(*new)); return -ENOMEM; } diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 69cbc79fbb71..2aaaa6807174 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -1094,7 +1094,8 @@ wavefront_send_sample (snd_wavefront_t *dev, if (dataptr < data_end) { - __get_user (sample_short, dataptr); + if (get_user(sample_short, dataptr)) + return -EFAULT; dataptr += skip; if (data_is_unsigned) { /* GUS ? */ diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cf531c1efa13..ad292df7d805 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -937,6 +937,9 @@ static int alc_init(struct hda_codec *codec) return 0; } +#define alc_free snd_hda_gen_free + +#ifdef CONFIG_PM static inline void alc_shutup(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -950,9 +953,6 @@ static inline void alc_shutup(struct hda_codec *codec) alc_shutup_pins(codec); } -#define alc_free snd_hda_gen_free - -#ifdef CONFIG_PM static void alc_power_eapd(struct hda_codec *codec) { alc_auto_setup_eapd(codec, false); @@ -966,9 +966,7 @@ static int alc_suspend(struct hda_codec *codec) spec->power_hook(codec); return 0; } -#endif -#ifdef CONFIG_PM static int alc_resume(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -6780,6 +6778,41 @@ static void alc256_fixup_mic_no_presence_and_resume(struct hda_codec *codec, } } +static void alc_fixup_dell4_mic_no_presence_quiet(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + struct hda_input_mux *imux = &spec->gen.input_mux; + int i; + + alc269_fixup_limit_int_mic_boost(codec, fix, action); + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + /** + * Set the vref of pin 0x19 (Headset Mic) and pin 0x1b (Headphone Mic) + * to Hi-Z to avoid pop noises at startup and when plugging and + * unplugging headphones. + */ + snd_hda_codec_set_pin_target(codec, 0x19, PIN_VREFHIZ); + snd_hda_codec_set_pin_target(codec, 0x1b, PIN_VREFHIZ); + break; + case HDA_FIXUP_ACT_PROBE: + /** + * Make the internal mic (0x12) the default input source to + * prevent pop noises on cold boot. + */ + for (i = 0; i < imux->num_items; i++) { + if (spec->gen.imux_pins[i] == 0x12) { + spec->gen.cur_mux[0] = i; + break; + } + } + break; + } +} + enum { ALC269_FIXUP_GPIO2, ALC269_FIXUP_SONY_VAIO, @@ -6821,6 +6854,7 @@ enum { ALC269_FIXUP_DELL2_MIC_NO_PRESENCE, ALC269_FIXUP_DELL3_MIC_NO_PRESENCE, ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET, ALC269_FIXUP_HEADSET_MODE, ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC, ALC269_FIXUP_ASPIRE_HEADSET_MIC, @@ -7012,6 +7046,7 @@ enum { ALC245_FIXUP_CS35L41_SPI_4, ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED, ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED, + ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, }; static const struct hda_fixup alc269_fixups[] = { @@ -8808,6 +8843,21 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC285_FIXUP_HP_MUTE_LED, }, + [ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_dell4_mic_no_presence_quiet, + .chained = true, + .chain_id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + }, + [ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x02a1112c }, /* use as headset mic, without its own jack detect */ + { } + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8898,6 +8948,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x09bf, "Dell Precision", ALC233_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0a2e, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0a38, "Dell Latitude 7520", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET), SND_PCI_QUIRK(0x1028, 0x0a58, "Dell", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a61, "Dell XPS 15 9510", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x0a62, "Dell Precision 5560", ALC289_FIXUP_DUAL_SPK), @@ -9040,6 +9091,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x89aa, "HP EliteBook 630 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ac, "HP EliteBook 640 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ae, "HP EliteBook 650 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89c3, "Zbook Studio G9", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), @@ -9290,6 +9342,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802), SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X), SND_PCI_QUIRK(0x1d05, 0x1132, "TongFang PHxTxX1", ALC256_FIXUP_SET_COEF_DEFAULTS), + SND_PCI_QUIRK(0x1d05, 0x1096, "TongFang GMxMRxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1100, "TongFang GKxNRxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1111, "TongFang GMxZGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1119, "TongFang GMxZGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1129, "TongFang GMxZGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1147, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), @@ -9297,6 +9357,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x8086, 0x2074, "Intel NUC 8", ALC233_FIXUP_INTEL_NUC8_DMIC), SND_PCI_QUIRK(0x8086, 0x2080, "Intel NUC 8 Rugged", ALC256_FIXUP_INTEL_NUC8_RUGGED), SND_PCI_QUIRK(0x8086, 0x2081, "Intel NUC 10", ALC256_FIXUP_INTEL_NUC10), + SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 0ea39565e623..40a5e3eb4ef2 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -3235,6 +3235,15 @@ YAMAHA_DEVICE(0x7010, "UB99"), } }, +/* Rane SL-1 */ +{ + USB_DEVICE(0x13e5, 0x0001), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + } +}, + /* disabled due to regression for other devices; * see https://bugzilla.kernel.org/show_bug.cgi?id=199905 */ diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index ab9f3da49941..fbbe59054c3f 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1822,6 +1822,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x06f8, 0xd002, /* Hercules DJ Console (Macintosh Edition) */ QUIRK_FLAG_IGNORE_CTL_ERROR), + DEVICE_FLG(0x0711, 0x5800, /* MCT Trigger 5 USB-to-HDMI */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x074d, 0x3553, /* Outlaw RR2150 (Micronas UAC3553B) */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x08bb, 0x2702, /* LineX FM Transmitter */ diff --git a/tools/Makefile b/tools/Makefile index db2f7b8ebed5..724134f0e56c 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -24,6 +24,7 @@ help: @echo ' intel-speed-select - Intel Speed Select tool' @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' + @echo ' nolibc - nolibc headers testing and installation' @echo ' objtool - an ELF object analysis tool' @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @@ -74,6 +75,9 @@ bpf/%: FORCE libapi: FORCE $(call descend,lib/api) +nolibc_%: FORCE + $(call descend,include/nolibc,$(patsubst nolibc_%,%,$@)) + # The perf build does not follow the descend function setup, # invoking it via it's own make rule. PERF_O = $(if $(O),$(O)/tools/perf,) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index ae61f464043a..c6a48d0ef9ff 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -98,6 +98,7 @@ FEATURE_TESTS_EXTRA := \ llvm-version \ clang \ libbpf \ + libbpf-btf__load_from_kernel_by_id \ libpfm4 \ libdebuginfod \ clang-bpf-co-re diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index de66e1cc0734..cb4a2a4fa2e4 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -57,6 +57,7 @@ FILES= \ test-lzma.bin \ test-bpf.bin \ test-libbpf.bin \ + test-libbpf-btf__load_from_kernel_by_id.bin \ test-get_cpuid.bin \ test-sdt.bin \ test-cxx.bin \ @@ -287,6 +288,9 @@ $(OUTPUT)test-bpf.bin: $(OUTPUT)test-libbpf.bin: $(BUILD) -lbpf +$(OUTPUT)test-libbpf-btf__load_from_kernel_by_id.bin: + $(BUILD) -lbpf + $(OUTPUT)test-sdt.bin: $(BUILD) diff --git a/tools/build/feature/test-libbpf-btf__load_from_kernel_by_id.c b/tools/build/feature/test-libbpf-btf__load_from_kernel_by_id.c new file mode 100644 index 000000000000..f7c084428735 --- /dev/null +++ b/tools/build/feature/test-libbpf-btf__load_from_kernel_by_id.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <bpf/libbpf.h> + +int main(void) +{ + return btf__load_from_kernel_by_id(20151128, NULL); +} diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile new file mode 100644 index 000000000000..7a16d917c185 --- /dev/null +++ b/tools/include/nolibc/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for nolibc installation and tests +include ../../scripts/Makefile.include + +# we're in ".../tools/include/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/include/,%,$(dir $(CURDIR))) +endif + +nolibc_arch := $(patsubst arm64,aarch64,$(ARCH)) +arch_file := arch-$(nolibc_arch).h +all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ + sys.h time.h types.h unistd.h + +# install all headers needed to support a bare-metal compiler +all: + +# Note: when ARCH is "x86" we concatenate both x86_64 and i386 +headers: + $(Q)mkdir -p $(OUTPUT)sysroot + $(Q)mkdir -p $(OUTPUT)sysroot/include + $(Q)cp $(all_files) $(OUTPUT)sysroot/include/ + $(Q)if [ "$(ARCH)" = "x86" ]; then \ + sed -e \ + 's,^#ifndef _NOLIBC_ARCH_X86_64_H,#if !defined(_NOLIBC_ARCH_X86_64_H) \&\& defined(__x86_64__),' \ + arch-x86_64.h; \ + sed -e \ + 's,^#ifndef _NOLIBC_ARCH_I386_H,#if !defined(_NOLIBC_ARCH_I386_H) \&\& !defined(__x86_64__),' \ + arch-i386.h; \ + elif [ -e "$(arch_file)" ]; then \ + cat $(arch_file); \ + else \ + echo "Fatal: architecture $(ARCH) not yet supported by nolibc." >&2; \ + exit 1; \ + fi > $(OUTPUT)sysroot/include/arch.h + +headers_standalone: headers + $(Q)$(MAKE) -C $(srctree) headers + $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)/sysroot + +clean: + $(call QUIET_CLEAN, nolibc) rm -rf "$(OUTPUT)sysroot" diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h new file mode 100644 index 000000000000..f68baf8f395f --- /dev/null +++ b/tools/include/nolibc/arch-aarch64.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * AARCH64 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_AARCH64_H +#define _NOLIBC_ARCH_AARCH64_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the newfstatat() syscall. Differs slightly from the + * x86_64's stat one by field ordering, so be careful. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + + long st_blocks; + long st_atime; + unsigned long st_atime_nsec; + long st_mtime; + + unsigned long st_mtime_nsec; + long st_ctime; + unsigned long st_ctime_nsec; + unsigned int __unused[2]; +}; + +/* Syscalls for AARCH64 : + * - registers are 64-bit + * - stack is 16-byte aligned + * - syscall number is passed in x8 + * - arguments are in x0, x1, x2, x3, x4, x5 + * - the system call is performed by calling svc 0 + * - syscall return comes in x0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * + * On aarch64, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0"); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ + register long _arg5 __asm__ ("x4") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = (long)(arg3); \ + register long _arg4 __asm__ ("x3") = (long)(arg4); \ + register long _arg5 __asm__ ("x4") = (long)(arg5); \ + register long _arg6 __asm__ ("x5") = (long)(arg6); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +__asm__ (".section .text\n" + ".weak _start\n" + "_start:\n" + "ldr x0, [sp]\n" // argc (x0) was in the stack + "add x1, sp, 8\n" // argv (x1) = sp + "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... + "add x2, x2, 8\n" // + 8 (skip null) + "add x2, x2, x1\n" // + argv + "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee + "bl main\n" // main() returns the status code, we'll exit with it. + "mov x8, 93\n" // NR_exit == 93 + "svc #0\n" + ""); + +#endif // _NOLIBC_ARCH_AARCH64_H diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h new file mode 100644 index 000000000000..f31be8e967d6 --- /dev/null +++ b/tools/include/nolibc/arch-arm.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ARM specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_ARM_H +#define _NOLIBC_ARCH_ARM_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). In big endian, the format + * differs as devices are returned as short only. + */ +struct sys_stat_struct { +#if defined(__ARMEB__) + unsigned short st_dev; + unsigned short __pad1; +#else + unsigned long st_dev; +#endif + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + +#if defined(__ARMEB__) + unsigned short st_rdev; + unsigned short __pad2; +#else + unsigned long st_rdev; +#endif + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +/* Syscalls for ARM in ARM or Thumb modes : + * - registers are 32-bit + * - stack is 8-byte aligned + * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) + * - syscall number is passed in r7 + * - arguments are in r0, r1, r2, r3, r4, r5 + * - the system call is performed by calling svc #0 + * - syscall return comes in r0. + * - only lr is clobbered. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, ARM supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0"); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ + register long _arg4 __asm__ ("r3") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("r7") = (num); \ + register long _arg1 __asm__ ("r0") = (long)(arg1); \ + register long _arg2 __asm__ ("r1") = (long)(arg2); \ + register long _arg3 __asm__ ("r2") = (long)(arg3); \ + register long _arg4 __asm__ ("r3") = (long)(arg4); \ + register long _arg5 __asm__ ("r4") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +/* startup code */ +__asm__ (".section .text\n" + ".weak _start\n" + "_start:\n" +#if defined(__THUMBEB__) || defined(__THUMBEL__) + /* We enter here in 32-bit mode but if some previous functions were in + * 16-bit mode, the assembler cannot know, so we need to tell it we're in + * 32-bit now, then switch to 16-bit (is there a better way to do it than + * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that + * it generates correct instructions. Note that we do not support thumb1. + */ + ".code 32\n" + "add r0, pc, #1\n" + "bx r0\n" + ".code 16\n" +#endif + "pop {%r0}\n" // argc was in the stack + "mov %r1, %sp\n" // argv = sp + "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... + "add %r2, %r2, $4\n" // ... + 4 + "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the + "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) + "bl main\n" // main() returns the status code, we'll exit with it. + "movs r7, $1\n" // NR_exit == 1 + "svc $0x00\n" + ""); + +#endif // _NOLIBC_ARCH_ARM_H diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h new file mode 100644 index 000000000000..d7e7212346e2 --- /dev/null +++ b/tools/include/nolibc/arch-i386.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * i386 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_I386_H +#define _NOLIBC_ARCH_I386_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +/* Syscalls for i386 : + * - mostly similar to x86_64 + * - registers are 32-bit + * - syscall number is passed in eax + * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively + * - all registers are preserved (except eax of course) + * - the system call is performed by calling int $0x80 + * - syscall return comes in eax + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, i386 supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ + register long _arg4 __asm__ ("esi") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num __asm__ ("eax") = (num); \ + register long _arg1 __asm__ ("ebx") = (long)(arg1); \ + register long _arg2 __asm__ ("ecx") = (long)(arg2); \ + register long _arg3 __asm__ ("edx") = (long)(arg3); \ + register long _arg4 __asm__ ("esi") = (long)(arg4); \ + register long _arg5 __asm__ ("edi") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _eax = (long)(num); \ + long _arg6 = (long)(arg6); /* Always in memory */ \ + __asm__ volatile ( \ + "pushl %[_arg6]\n\t" \ + "pushl %%ebp\n\t" \ + "movl 4(%%esp),%%ebp\n\t" \ + "int $0x80\n\t" \ + "popl %%ebp\n\t" \ + "addl $4,%%esp\n\t" \ + : "+a"(_eax) /* %eax */ \ + : "b"(arg1), /* %ebx */ \ + "c"(arg2), /* %ecx */ \ + "d"(arg3), /* %edx */ \ + "S"(arg4), /* %esi */ \ + "D"(arg5), /* %edi */ \ + [_arg6]"m"(_arg6) /* memory */ \ + : "memory", "cc" \ + ); \ + _eax; \ +}) + +/* startup code */ +/* + * i386 System V ABI mandates: + * 1) last pushed argument must be 16-byte aligned. + * 2) The deepest stack frame should be set to zero + * + */ +__asm__ (".section .text\n" + ".weak _start\n" + "_start:\n" + "pop %eax\n" // argc (first arg, %eax) + "mov %esp, %ebx\n" // argv[] (second arg, %ebx) + "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before + "sub $4, %esp\n" // the call instruction (args are aligned) + "push %ecx\n" // push all registers on the stack so that we + "push %ebx\n" // support both regparm and plain stack modes + "push %eax\n" + "call main\n" // main() returns the status code in %eax + "mov %eax, %ebx\n" // retrieve exit code (32-bit int) + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now + "hlt\n" // ensure it does not + ""); + +#endif // _NOLIBC_ARCH_I386_H diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h new file mode 100644 index 000000000000..5fc5b8029bff --- /dev/null +++ b/tools/include/nolibc/arch-mips.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * MIPS specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_MIPS_H +#define _NOLIBC_ARCH_MIPS_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_APPEND 0x0008 +#define O_NONBLOCK 0x0080 +#define O_CREAT 0x0100 +#define O_TRUNC 0x0200 +#define O_EXCL 0x0400 +#define O_NOCTTY 0x0800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall. 88 bytes are returned by the + * syscall. + */ +struct sys_stat_struct { + unsigned int st_dev; + long st_pad1[3]; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned int st_rdev; + long st_pad2[2]; + long st_size; + long st_pad3; + + long st_atime; + long st_atime_nsec; + long st_mtime; + long st_mtime_nsec; + + long st_ctime; + long st_ctime_nsec; + long st_blksize; + long st_blocks; + long st_pad4[14]; +}; + +/* Syscalls for MIPS ABI O32 : + * - WARNING! there's always a delayed slot! + * - WARNING again, the syntax is different, registers take a '$' and numbers + * do not. + * - registers are 32-bit + * - stack is 8-byte aligned + * - syscall number is passed in v0 (starts at 0xfa0). + * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to + * leave some room in the stack for the callee to save a0..a3 if needed. + * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are + * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as + * scall32-o32.S in the kernel sources. + * - the system call is performed by calling "syscall" + * - syscall return comes in v0, and register a3 needs to be checked to know + * if an error occurred, in which case errno is in v0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + */ + +#define my_syscall0(num) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg4 __asm__ ("a3"); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "r"(_num) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg4 __asm__ ("a3"); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg4 __asm__ ("a3"); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3"); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("v0") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 = (long)(arg5); \ + \ + __asm__ volatile ( \ + "addiu $sp, $sp, -32\n" \ + "sw %7, 16($sp)\n" \ + "syscall\n " \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +/* startup code, note that it's called __start on MIPS */ +__asm__ (".section .text\n" + ".weak __start\n" + ".set nomips16\n" + ".set noreorder\n" + ".option pic0\n" + ".ent __start\n" + "__start:\n" + "lw $a0,($sp)\n" // argc was in the stack + "addiu $a1, $sp, 4\n" // argv = sp + 4 + "sll $a2, $a0, 2\n" // a2 = argc * 4 + "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... + "addiu $a2, $a2, 4\n" // ... + 4 + "li $t0, -8\n" + "and $sp, $sp, $t0\n" // sp must be 8-byte aligned + "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! + "jal main\n" // main() returns the status code, we'll exit with it. + "nop\n" // delayed slot + "move $a0, $v0\n" // retrieve 32-bit exit code from v0 + "li $v0, 4001\n" // NR_exit == 4001 + "syscall\n" + ".end __start\n" + ""); + +#endif // _NOLIBC_ARCH_MIPS_H diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h new file mode 100644 index 000000000000..95e2b7924925 --- /dev/null +++ b/tools/include/nolibc/arch-riscv.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * RISCV (32 and 64) specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_RISCV_H +#define _NOLIBC_ARCH_RISCV_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x100 +#define O_EXCL 0x200 +#define O_NOCTTY 0x400 +#define O_TRUNC 0x1000 +#define O_APPEND 0x2000 +#define O_NONBLOCK 0x4000 +#define O_DIRECTORY 0x200000 + +struct sys_stat_struct { + unsigned long st_dev; /* Device. */ + unsigned long st_ino; /* File serial number. */ + unsigned int st_mode; /* File mode. */ + unsigned int st_nlink; /* Link count. */ + unsigned int st_uid; /* User ID of the file's owner. */ + unsigned int st_gid; /* Group ID of the file's group. */ + unsigned long st_rdev; /* Device number, if device. */ + unsigned long __pad1; + long st_size; /* Size of file, in bytes. */ + int st_blksize; /* Optimal block size for I/O. */ + int __pad2; + long st_blocks; /* Number 512-byte blocks allocated. */ + long st_atime; /* Time of last access. */ + unsigned long st_atime_nsec; + long st_mtime; /* Time of last modification. */ + unsigned long st_mtime_nsec; + long st_ctime; /* Time of last status change. */ + unsigned long st_ctime_nsec; + unsigned int __unused4; + unsigned int __unused5; +}; + +#if __riscv_xlen == 64 +#define PTRLOG "3" +#define SZREG "8" +#elif __riscv_xlen == 32 +#define PTRLOG "2" +#define SZREG "4" +#endif + +/* Syscalls for RISCV : + * - stack is 16-byte aligned + * - syscall number is passed in a7 + * - arguments are in a0, a1, a2, a3, a4, a5 + * - the system call is performed by calling ecall + * - syscall return comes in a0 + * - the arguments are cast to long and assigned into the target + * registers which are then simply passed as registers to the asm code, + * so that we don't have to experience issues with register constraints. + * + * On riscv, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0"); \ + \ + __asm__ volatile ( \ + "ecall\n\t" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "ecall\n\t" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + register long _arg6 __asm__ ("a5") = (long)(arg6); \ + \ + __asm__ volatile ( \ + "ecall\n" \ + : "+r"(_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +__asm__ (".section .text\n" + ".weak _start\n" + "_start:\n" + ".option push\n" + ".option norelax\n" + "lla gp, __global_pointer$\n" + ".option pop\n" + "ld a0, 0(sp)\n" // argc (a0) was in the stack + "add a1, sp, "SZREG"\n" // argv (a1) = sp + "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... + "add a2, a2, "SZREG"\n" // + SZREG (skip null) + "add a2,a2,a1\n" // + argv + "andi sp,a1,-16\n" // sp must be 16-byte aligned + "call main\n" // main() returns the status code, we'll exit with it. + "li a7, 93\n" // NR_exit == 93 + "ecall\n" + ""); + +#endif // _NOLIBC_ARCH_RISCV_H diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h new file mode 100644 index 000000000000..0e1e9eb8545d --- /dev/null +++ b/tools/include/nolibc/arch-x86_64.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * x86_64 specific definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ARCH_X86_64_H +#define _NOLIBC_ARCH_X86_64_H + +/* O_* macros for fcntl/open are architecture-specific */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, equivalent to stat64(). The + * syscall returns 116 bytes and stops in the middle of __unused. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + + long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; + +/* Syscalls for x86_64 : + * - registers are 64-bit + * - syscall number is passed in rax + * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively + * - the system call is performed by calling the syscall instruction + * - syscall return comes in rax + * - rcx and r11 are clobbered, others are preserved. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 + * Calling Conventions. + * + * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/home + * + */ + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ + register long _arg5 __asm__ ("r8") = (long)(arg5); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _ret; \ + register long _num __asm__ ("rax") = (num); \ + register long _arg1 __asm__ ("rdi") = (long)(arg1); \ + register long _arg2 __asm__ ("rsi") = (long)(arg2); \ + register long _arg3 __asm__ ("rdx") = (long)(arg3); \ + register long _arg4 __asm__ ("r10") = (long)(arg4); \ + register long _arg5 __asm__ ("r8") = (long)(arg5); \ + register long _arg6 __asm__ ("r9") = (long)(arg6); \ + \ + __asm__ volatile ( \ + "syscall\n" \ + : "=a"(_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +/* + * x86-64 System V ABI mandates: + * 1) %rsp must be 16-byte aligned right before the function call. + * 2) The deepest stack frame should be zero (the %rbp). + * + */ +__asm__ (".section .text\n" + ".weak _start\n" + "_start:\n" + "pop %rdi\n" // argc (first arg, %rdi) + "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) + "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) + "xor %ebp, %ebp\n" // zero the stack frame + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call + "call main\n" // main() returns the status code, we'll exit with it. + "mov %eax, %edi\n" // retrieve exit code (32 bit) + "mov $60, %eax\n" // NR_exit == 60 + "syscall\n" // really exit + "hlt\n" // ensure it does not return + ""); + +#endif // _NOLIBC_ARCH_X86_64_H diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h new file mode 100644 index 000000000000..4c6992321b0d --- /dev/null +++ b/tools/include/nolibc/arch.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +/* Below comes the architecture-specific code. For each architecture, we have + * the syscall declarations and the _start code definition. This is the only + * global part. On all architectures the kernel puts everything in the stack + * before jumping to _start just above us, without any return address (_start + * is not a function but an entry pint). So at the stack pointer we find argc. + * Then argv[] begins, and ends at the first NULL. Then we have envp which + * starts and ends with a NULL as well. So envp=argv+argc+1. + */ + +#ifndef _NOLIBC_ARCH_H +#define _NOLIBC_ARCH_H + +#if defined(__x86_64__) +#include "arch-x86_64.h" +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +#include "arch-i386.h" +#elif defined(__ARM_EABI__) +#include "arch-arm.h" +#elif defined(__aarch64__) +#include "arch-aarch64.h" +#elif defined(__mips__) && defined(_ABIO32) +#include "arch-mips.h" +#elif defined(__riscv) +#include "arch-riscv.h" +#endif + +#endif /* _NOLIBC_ARCH_H */ diff --git a/tools/include/nolibc/ctype.h b/tools/include/nolibc/ctype.h new file mode 100644 index 000000000000..e3000b2992d7 --- /dev/null +++ b/tools/include/nolibc/ctype.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ctype function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_CTYPE_H +#define _NOLIBC_CTYPE_H + +#include "std.h" + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +static __attribute__((unused)) +int isascii(int c) +{ + /* 0x00..0x7f */ + return (unsigned int)c <= 0x7f; +} + +static __attribute__((unused)) +int isblank(int c) +{ + return c == '\t' || c == ' '; +} + +static __attribute__((unused)) +int iscntrl(int c) +{ + /* 0x00..0x1f, 0x7f */ + return (unsigned int)c < 0x20 || c == 0x7f; +} + +static __attribute__((unused)) +int isdigit(int c) +{ + return (unsigned int)(c - '0') < 10; +} + +static __attribute__((unused)) +int isgraph(int c) +{ + /* 0x21..0x7e */ + return (unsigned int)(c - 0x21) < 0x5e; +} + +static __attribute__((unused)) +int islower(int c) +{ + return (unsigned int)(c - 'a') < 26; +} + +static __attribute__((unused)) +int isprint(int c) +{ + /* 0x20..0x7e */ + return (unsigned int)(c - 0x20) < 0x5f; +} + +static __attribute__((unused)) +int isspace(int c) +{ + /* \t is 0x9, \n is 0xA, \v is 0xB, \f is 0xC, \r is 0xD */ + return ((unsigned int)c == ' ') || (unsigned int)(c - 0x09) < 5; +} + +static __attribute__((unused)) +int isupper(int c) +{ + return (unsigned int)(c - 'A') < 26; +} + +static __attribute__((unused)) +int isxdigit(int c) +{ + return isdigit(c) || (unsigned int)(c - 'A') < 6 || (unsigned int)(c - 'a') < 6; +} + +static __attribute__((unused)) +int isalpha(int c) +{ + return islower(c) || isupper(c); +} + +static __attribute__((unused)) +int isalnum(int c) +{ + return isalpha(c) || isdigit(c); +} + +static __attribute__((unused)) +int ispunct(int c) +{ + return isgraph(c) && !isalnum(c); +} + +#endif /* _NOLIBC_CTYPE_H */ diff --git a/tools/include/nolibc/errno.h b/tools/include/nolibc/errno.h new file mode 100644 index 000000000000..06893d6dfb7a --- /dev/null +++ b/tools/include/nolibc/errno.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Minimal errno definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_ERRNO_H +#define _NOLIBC_ERRNO_H + +#include <asm/errno.h> + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memory page. + */ +#define MAX_ERRNO 4095 + +#endif /* _NOLIBC_ERRNO_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index c1c285fe494a..b2bc48d3cfe4 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -57,22 +57,32 @@ * having to specify anything. * * Finally some very common libc-level functions are provided. It is the case - * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing - * is currently provided regarding stdio emulation. + * for a few functions usually found in string.h, ctype.h, or stdlib.h. * - * The macro NOLIBC is always defined, so that it is possible for a program to - * check this macro to know if it is being built against and decide to disable - * some features or simply not to include some standard libc files. - * - * Ideally this file should be split in multiple files for easier long term - * maintenance, but provided as a single file as it is now, it's quite - * convenient to use. Maybe some variations involving a set of includes at the - * top could work. + * The nolibc.h file is only a convenient entry point which includes all other + * files. It also defines the NOLIBC macro, so that it is possible for a + * program to check this macro to know if it is being built against and decide + * to disable some features or simply not to include some standard libc files. * * A simple static executable may be built this way : * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ * -static -include nolibc.h -o hello hello.c -lgcc * + * Simple programs meant to be reasonably portable to various libc and using + * only a few common includes, may also be built by simply making the include + * path point to the nolibc directory: + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -I../nolibc -o hello hello.c -lgcc + * + * The available standard (but limited) include files are: + * ctype.h, errno.h, signal.h, stdio.h, stdlib.h, string.h, time.h + * + * In addition, the following ones are expected to be provided by the compiler: + * float.h, stdarg.h, stddef.h + * + * The following ones which are part to the C standard are not provided: + * assert.h, locale.h, math.h, setjmp.h, limits.h + * * A very useful calling convention table may be found here : * http://man7.org/linux/man-pages/man2/syscall.2.html * @@ -80,2502 +90,22 @@ * https://w3challs.com/syscalls/ * */ +#ifndef _NOLIBC_H +#define _NOLIBC_H -#include <asm/unistd.h> -#include <asm/ioctls.h> -#include <asm/errno.h> -#include <linux/fs.h> -#include <linux/loop.h> -#include <linux/time.h> +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" +#include "ctype.h" +#include "signal.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" +#include "time.h" +#include "unistd.h" +/* Used by programs to avoid std includes */ #define NOLIBC -/* this way it will be removed if unused */ -static int errno; - -#ifndef NOLIBC_IGNORE_ERRNO -#define SET_ERRNO(v) do { errno = (v); } while (0) -#else -#define SET_ERRNO(v) do { } while (0) -#endif - -/* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memory page. - */ -#define MAX_ERRNO 4095 - -/* Declare a few quite common macros and types that usually are in stdlib.h, - * stdint.h, ctype.h, unistd.h and a few other common locations. - */ - -#define NULL ((void *)0) - -/* stdint types */ -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef unsigned short uint16_t; -typedef signed short int16_t; -typedef unsigned int uint32_t; -typedef signed int int32_t; -typedef unsigned long long uint64_t; -typedef signed long long int64_t; -typedef unsigned long size_t; -typedef signed long ssize_t; -typedef unsigned long uintptr_t; -typedef signed long intptr_t; -typedef signed long ptrdiff_t; - -/* for stat() */ -typedef unsigned int dev_t; -typedef unsigned long ino_t; -typedef unsigned int mode_t; -typedef signed int pid_t; -typedef unsigned int uid_t; -typedef unsigned int gid_t; -typedef unsigned long nlink_t; -typedef signed long off_t; -typedef signed long blksize_t; -typedef signed long blkcnt_t; -typedef signed long time_t; - -/* for poll() */ -struct pollfd { - int fd; - short int events; - short int revents; -}; - -/* for getdents64() */ -struct linux_dirent64 { - uint64_t d_ino; - int64_t d_off; - unsigned short d_reclen; - unsigned char d_type; - char d_name[]; -}; - -/* commonly an fd_set represents 256 FDs */ -#define FD_SETSIZE 256 -typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; - -/* needed by wait4() */ -struct rusage { - struct timeval ru_utime; - struct timeval ru_stime; - long ru_maxrss; - long ru_ixrss; - long ru_idrss; - long ru_isrss; - long ru_minflt; - long ru_majflt; - long ru_nswap; - long ru_inblock; - long ru_oublock; - long ru_msgsnd; - long ru_msgrcv; - long ru_nsignals; - long ru_nvcsw; - long ru_nivcsw; -}; - -/* stat flags (WARNING, octal here) */ -#define S_IFDIR 0040000 -#define S_IFCHR 0020000 -#define S_IFBLK 0060000 -#define S_IFREG 0100000 -#define S_IFIFO 0010000 -#define S_IFLNK 0120000 -#define S_IFSOCK 0140000 -#define S_IFMT 0170000 - -#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) -#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) -#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) -#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) -#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) -#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) -#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) - -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 - -/* all the *at functions */ -#ifndef AT_FDCWD -#define AT_FDCWD -100 -#endif - -/* lseek */ -#define SEEK_SET 0 -#define SEEK_CUR 1 -#define SEEK_END 2 - -/* reboot */ -#define LINUX_REBOOT_MAGIC1 0xfee1dead -#define LINUX_REBOOT_MAGIC2 0x28121969 -#define LINUX_REBOOT_CMD_HALT 0xcdef0123 -#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc -#define LINUX_REBOOT_CMD_RESTART 0x01234567 -#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 - - -/* The format of the struct as returned by the libc to the application, which - * significantly differs from the format returned by the stat() syscall flavours. - */ -struct stat { - dev_t st_dev; /* ID of device containing file */ - ino_t st_ino; /* inode number */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device ID (if special file) */ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for file system I/O */ - blkcnt_t st_blocks; /* number of 512B blocks allocated */ - time_t st_atime; /* time of last access */ - time_t st_mtime; /* time of last modification */ - time_t st_ctime; /* time of last status change */ -}; - -#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) -#define WIFEXITED(status) (((status) & 0x7f) == 0) - -/* for SIGCHLD */ -#include <asm/signal.h> - -/* Below comes the architecture-specific code. For each architecture, we have - * the syscall declarations and the _start code definition. This is the only - * global part. On all architectures the kernel puts everything in the stack - * before jumping to _start just above us, without any return address (_start - * is not a function but an entry pint). So at the stack pointer we find argc. - * Then argv[] begins, and ends at the first NULL. Then we have envp which - * starts and ends with a NULL as well. So envp=argv+argc+1. - */ - -#if defined(__x86_64__) -/* Syscalls for x86_64 : - * - registers are 64-bit - * - syscall number is passed in rax - * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively - * - the system call is performed by calling the syscall instruction - * - syscall return comes in rax - * - rcx and r11 are clobbered, others are preserved. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1 - * Calling Conventions. - * - * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI - * - */ - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - register long _arg6 asm("r9") = (long)(arg6); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a"(_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -/* - * x86-64 System V ABI mandates: - * 1) %rsp must be 16-byte aligned right before the function call. - * 2) The deepest stack frame should be zero (the %rbp). - * - */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %rdi\n" // argc (first arg, %rdi) - "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) - "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) - "xor %ebp, %ebp\n" // zero the stack frame - "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned before call - "call main\n" // main() returns the status code, we'll exit with it. - "mov %eax, %edi\n" // retrieve exit code (32 bit) - "mov $60, %eax\n" // NR_exit == 60 - "syscall\n" // really exit - "hlt\n" // ensure it does not return - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, equivalent to stat64(). The - * syscall returns 116 bytes and stops in the middle of __unused. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; - unsigned int st_mode; - unsigned int st_uid; - - unsigned int st_gid; - unsigned int __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - - long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - long __unused[3]; -}; - -#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) -/* Syscalls for i386 : - * - mostly similar to x86_64 - * - registers are 32-bit - * - syscall number is passed in eax - * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively - * - all registers are preserved (except eax of course) - * - the system call is performed by calling int $0x80 - * - syscall return comes in eax - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, i386 supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - register long _arg5 asm("edi") = (long)(arg5); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -/* - * i386 System V ABI mandates: - * 1) last pushed argument must be 16-byte aligned. - * 2) The deepest stack frame should be set to zero - * - */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %eax\n" // argc (first arg, %eax) - "mov %esp, %ebx\n" // argv[] (second arg, %ebx) - "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) - "xor %ebp, %ebp\n" // zero the stack frame - "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned before - "sub $4, %esp\n" // the call instruction (args are aligned) - "push %ecx\n" // push all registers on the stack so that we - "push %ebx\n" // support both regparm and plain stack modes - "push %eax\n" - "call main\n" // main() returns the status code in %eax - "mov %eax, %ebx\n" // retrieve exit code (32-bit int) - "movl $1, %eax\n" // NR_exit == 1 - "int $0x80\n" // exit now - "hlt\n" // ensure it does not - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - - unsigned long st_rdev; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__ARM_EABI__) -/* Syscalls for ARM in ARM or Thumb modes : - * - registers are 32-bit - * - stack is 8-byte aligned - * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) - * - syscall number is passed in r7 - * - arguments are in r0, r1, r2, r3, r4, r5 - * - the system call is performed by calling svc #0 - * - syscall return comes in r0. - * - only lr is clobbered. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, ARM supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - register long _arg5 asm("r4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" -#if defined(__THUMBEB__) || defined(__THUMBEL__) - /* We enter here in 32-bit mode but if some previous functions were in - * 16-bit mode, the assembler cannot know, so we need to tell it we're in - * 32-bit now, then switch to 16-bit (is there a better way to do it than - * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that - * it generates correct instructions. Note that we do not support thumb1. - */ - ".code 32\n" - "add r0, pc, #1\n" - "bx r0\n" - ".code 16\n" -#endif - "pop {%r0}\n" // argc was in the stack - "mov %r1, %sp\n" // argv = sp - "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... - "add %r2, %r2, $4\n" // ... + 4 - "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the - "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) - "bl main\n" // main() returns the status code, we'll exit with it. - "movs r7, $1\n" // NR_exit == 1 - "svc $0x00\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). In big endian, the format - * differs as devices are returned as short only. - */ -struct sys_stat_struct { -#if defined(__ARMEB__) - unsigned short st_dev; - unsigned short __pad1; -#else - unsigned long st_dev; -#endif - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; -#if defined(__ARMEB__) - unsigned short st_rdev; - unsigned short __pad2; -#else - unsigned long st_rdev; -#endif - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__aarch64__) -/* Syscalls for AARCH64 : - * - registers are 64-bit - * - stack is 16-byte aligned - * - syscall number is passed in x8 - * - arguments are in x0, x1, x2, x3, x4, x5 - * - the system call is performed by calling svc 0 - * - syscall return comes in x0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - * On aarch64, select() is not implemented so we have to use pselect6(). - */ -#define __ARCH_WANT_SYS_PSELECT6 - -#define my_syscall0(num) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - register long _arg6 asm("x5") = (long)(arg6); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "ldr x0, [sp]\n" // argc (x0) was in the stack - "add x1, sp, 8\n" // argv (x1) = sp - "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... - "add x2, x2, 8\n" // + 8 (skip null) - "add x2, x2, x1\n" // + argv - "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee - "bl main\n" // main() returns the status code, we'll exit with it. - "mov x8, 93\n" // NR_exit == 93 - "svc #0\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the newfstatat() syscall. Differs slightly from the - * x86_64's stat one by field ordering, so be careful. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - - unsigned long st_rdev; - unsigned long __pad1; - long st_size; - int st_blksize; - int __pad2; - - long st_blocks; - long st_atime; - unsigned long st_atime_nsec; - long st_mtime; - - unsigned long st_mtime_nsec; - long st_ctime; - unsigned long st_ctime_nsec; - unsigned int __unused[2]; -}; - -#elif defined(__mips__) && defined(_ABIO32) -/* Syscalls for MIPS ABI O32 : - * - WARNING! there's always a delayed slot! - * - WARNING again, the syntax is different, registers take a '$' and numbers - * do not. - * - registers are 32-bit - * - stack is 8-byte aligned - * - syscall number is passed in v0 (starts at 0xfa0). - * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to - * leave some room in the stack for the callee to save a0..a3 if needed. - * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are - * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as - * scall32-o32.S in the kernel sources. - * - the system call is performed by calling "syscall" - * - syscall return comes in v0, and register a3 needs to be checked to know - * if an error occurred, in which case errno is in v0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - */ - -#define my_syscall0(num) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "r"(_num) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 = (long)(arg5); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "sw %7, 16($sp)\n" \ - "syscall\n " \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -/* startup code, note that it's called __start on MIPS */ -asm(".section .text\n" - ".set nomips16\n" - ".global __start\n" - ".set noreorder\n" - ".option pic0\n" - ".ent __start\n" - "__start:\n" - "lw $a0,($sp)\n" // argc was in the stack - "addiu $a1, $sp, 4\n" // argv = sp + 4 - "sll $a2, $a0, 2\n" // a2 = argc * 4 - "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... - "addiu $a2, $a2, 4\n" // ... + 4 - "li $t0, -8\n" - "and $sp, $sp, $t0\n" // sp must be 8-byte aligned - "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! - "jal main\n" // main() returns the status code, we'll exit with it. - "nop\n" // delayed slot - "move $a0, $v0\n" // retrieve 32-bit exit code from v0 - "li $v0, 4001\n" // NR_exit == 4001 - "syscall\n" - ".end __start\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_APPEND 0x0008 -#define O_NONBLOCK 0x0080 -#define O_CREAT 0x0100 -#define O_TRUNC 0x0200 -#define O_EXCL 0x0400 -#define O_NOCTTY 0x0800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall. 88 bytes are returned by the - * syscall. - */ -struct sys_stat_struct { - unsigned int st_dev; - long st_pad1[3]; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - unsigned int st_rdev; - long st_pad2[2]; - long st_size; - long st_pad3; - long st_atime; - long st_atime_nsec; - long st_mtime; - long st_mtime_nsec; - long st_ctime; - long st_ctime_nsec; - long st_blksize; - long st_blocks; - long st_pad4[14]; -}; - -#elif defined(__riscv) - -#if __riscv_xlen == 64 -#define PTRLOG "3" -#define SZREG "8" -#elif __riscv_xlen == 32 -#define PTRLOG "2" -#define SZREG "4" -#endif - -/* Syscalls for RISCV : - * - stack is 16-byte aligned - * - syscall number is passed in a7 - * - arguments are in a0, a1, a2, a3, a4, a5 - * - the system call is performed by calling ecall - * - syscall return comes in a0 - * - the arguments are cast to long and assigned into the target - * registers which are then simply passed as registers to the asm code, - * so that we don't have to experience issues with register constraints. - */ - -#define my_syscall0(num) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0"); \ - \ - asm volatile ( \ - "ecall\n\t" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - \ - asm volatile ( \ - "ecall\n\t" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("a7") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 asm("a4") = (long)(arg5); \ - register long _arg6 asm("a5") = (long)(arg6); \ - \ - asm volatile ( \ - "ecall\n" \ - : "+r"(_arg1) \ - : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), "r"(_arg6), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - ".option push\n" - ".option norelax\n" - "lla gp, __global_pointer$\n" - ".option pop\n" - "ld a0, 0(sp)\n" // argc (a0) was in the stack - "add a1, sp, "SZREG"\n" // argv (a1) = sp - "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... - "add a2, a2, "SZREG"\n" // + SZREG (skip null) - "add a2,a2,a1\n" // + argv - "andi sp,a1,-16\n" // sp must be 16-byte aligned - "call main\n" // main() returns the status code, we'll exit with it. - "li a7, 93\n" // NR_exit == 93 - "ecall\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x100 -#define O_EXCL 0x200 -#define O_NOCTTY 0x400 -#define O_TRUNC 0x1000 -#define O_APPEND 0x2000 -#define O_NONBLOCK 0x4000 -#define O_DIRECTORY 0x200000 - -struct sys_stat_struct { - unsigned long st_dev; /* Device. */ - unsigned long st_ino; /* File serial number. */ - unsigned int st_mode; /* File mode. */ - unsigned int st_nlink; /* Link count. */ - unsigned int st_uid; /* User ID of the file's owner. */ - unsigned int st_gid; /* Group ID of the file's group. */ - unsigned long st_rdev; /* Device number, if device. */ - unsigned long __pad1; - long st_size; /* Size of file, in bytes. */ - int st_blksize; /* Optimal block size for I/O. */ - int __pad2; - long st_blocks; /* Number 512-byte blocks allocated. */ - long st_atime; /* Time of last access. */ - unsigned long st_atime_nsec; - long st_mtime; /* Time of last modification. */ - unsigned long st_mtime_nsec; - long st_ctime; /* Time of last status change. */ - unsigned long st_ctime_nsec; - unsigned int __unused4; - unsigned int __unused5; -}; - -#endif - - -/* Below are the C functions used to declare the raw syscalls. They try to be - * architecture-agnostic, and return either a success or -errno. Declaring them - * static will lead to them being inlined in most cases, but it's still possible - * to reference them by a pointer if needed. - */ -static __attribute__((unused)) -void *sys_brk(void *addr) -{ - return (void *)my_syscall1(__NR_brk, addr); -} - -static __attribute__((noreturn,unused)) -void sys_exit(int status) -{ - my_syscall1(__NR_exit, status & 255); - while(1); // shut the "noreturn" warnings. -} - -static __attribute__((unused)) -int sys_chdir(const char *path) -{ - return my_syscall1(__NR_chdir, path); -} - -static __attribute__((unused)) -int sys_chmod(const char *path, mode_t mode) -{ -#ifdef __NR_fchmodat - return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#elif defined(__NR_chmod) - return my_syscall2(__NR_chmod, path, mode); -#else -#error Neither __NR_fchmodat nor __NR_chmod defined, cannot implement sys_chmod() -#endif -} - -static __attribute__((unused)) -int sys_chown(const char *path, uid_t owner, gid_t group) -{ -#ifdef __NR_fchownat - return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#elif defined(__NR_chown) - return my_syscall3(__NR_chown, path, owner, group); -#else -#error Neither __NR_fchownat nor __NR_chown defined, cannot implement sys_chown() -#endif -} - -static __attribute__((unused)) -int sys_chroot(const char *path) -{ - return my_syscall1(__NR_chroot, path); -} - -static __attribute__((unused)) -int sys_close(int fd) -{ - return my_syscall1(__NR_close, fd); -} - -static __attribute__((unused)) -int sys_dup(int fd) -{ - return my_syscall1(__NR_dup, fd); -} - -#ifdef __NR_dup3 -static __attribute__((unused)) -int sys_dup3(int old, int new, int flags) -{ - return my_syscall3(__NR_dup3, old, new, flags); -} -#endif - -static __attribute__((unused)) -int sys_dup2(int old, int new) -{ -#ifdef __NR_dup3 - return my_syscall3(__NR_dup3, old, new, 0); -#elif defined(__NR_dup2) - return my_syscall2(__NR_dup2, old, new); -#else -#error Neither __NR_dup3 nor __NR_dup2 defined, cannot implement sys_dup2() -#endif -} - -static __attribute__((unused)) -int sys_execve(const char *filename, char *const argv[], char *const envp[]) -{ - return my_syscall3(__NR_execve, filename, argv, envp); -} - -static __attribute__((unused)) -pid_t sys_fork(void) -{ -#ifdef __NR_clone - /* note: some archs only have clone() and not fork(). Different archs - * have a different API, but most archs have the flags on first arg and - * will not use the rest with no other flag. - */ - return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); -#elif defined(__NR_fork) - return my_syscall0(__NR_fork); -#else -#error Neither __NR_clone nor __NR_fork defined, cannot implement sys_fork() -#endif -} - -static __attribute__((unused)) -int sys_fsync(int fd) -{ - return my_syscall1(__NR_fsync, fd); -} - -static __attribute__((unused)) -int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - return my_syscall3(__NR_getdents64, fd, dirp, count); -} - -static __attribute__((unused)) -pid_t sys_getpgid(pid_t pid) -{ - return my_syscall1(__NR_getpgid, pid); -} - -static __attribute__((unused)) -pid_t sys_getpgrp(void) -{ - return sys_getpgid(0); -} - -static __attribute__((unused)) -pid_t sys_getpid(void) -{ - return my_syscall0(__NR_getpid); -} - -static __attribute__((unused)) -pid_t sys_gettid(void) -{ - return my_syscall0(__NR_gettid); -} - -static __attribute__((unused)) -int sys_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - return my_syscall2(__NR_gettimeofday, tv, tz); -} - -static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) -{ - return my_syscall3(__NR_ioctl, fd, req, value); -} - -static __attribute__((unused)) -int sys_kill(pid_t pid, int signal) -{ - return my_syscall2(__NR_kill, pid, signal); -} - -static __attribute__((unused)) -int sys_link(const char *old, const char *new) -{ -#ifdef __NR_linkat - return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#elif defined(__NR_link) - return my_syscall2(__NR_link, old, new); -#else -#error Neither __NR_linkat nor __NR_link defined, cannot implement sys_link() -#endif -} - -static __attribute__((unused)) -off_t sys_lseek(int fd, off_t offset, int whence) -{ - return my_syscall3(__NR_lseek, fd, offset, whence); -} - -static __attribute__((unused)) -int sys_mkdir(const char *path, mode_t mode) -{ -#ifdef __NR_mkdirat - return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#elif defined(__NR_mkdir) - return my_syscall2(__NR_mkdir, path, mode); -#else -#error Neither __NR_mkdirat nor __NR_mkdir defined, cannot implement sys_mkdir() -#endif -} - -static __attribute__((unused)) -long sys_mknod(const char *path, mode_t mode, dev_t dev) -{ -#ifdef __NR_mknodat - return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#elif defined(__NR_mknod) - return my_syscall3(__NR_mknod, path, mode, dev); -#else -#error Neither __NR_mknodat nor __NR_mknod defined, cannot implement sys_mknod() -#endif -} - -static __attribute__((unused)) -int sys_mount(const char *src, const char *tgt, const char *fst, - unsigned long flags, const void *data) -{ - return my_syscall5(__NR_mount, src, tgt, fst, flags, data); -} - -static __attribute__((unused)) -int sys_open(const char *path, int flags, mode_t mode) -{ -#ifdef __NR_openat - return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#elif defined(__NR_open) - return my_syscall3(__NR_open, path, flags, mode); -#else -#error Neither __NR_openat nor __NR_open defined, cannot implement sys_open() -#endif -} - -static __attribute__((unused)) -int sys_pivot_root(const char *new, const char *old) -{ - return my_syscall2(__NR_pivot_root, new, old); -} - -static __attribute__((unused)) -int sys_poll(struct pollfd *fds, int nfds, int timeout) -{ -#if defined(__NR_ppoll) - struct timespec t; - - if (timeout >= 0) { - t.tv_sec = timeout / 1000; - t.tv_nsec = (timeout % 1000) * 1000000; - } - return my_syscall4(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL); -#elif defined(__NR_poll) - return my_syscall3(__NR_poll, fds, nfds, timeout); -#else -#error Neither __NR_ppoll nor __NR_poll defined, cannot implement sys_poll() -#endif -} - -static __attribute__((unused)) -ssize_t sys_read(int fd, void *buf, size_t count) -{ - return my_syscall3(__NR_read, fd, buf, count); -} - -static __attribute__((unused)) -ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) -{ - return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); -} - -static __attribute__((unused)) -int sys_sched_yield(void) -{ - return my_syscall0(__NR_sched_yield); -} - -static __attribute__((unused)) -int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) - struct timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#elif defined(__NR__newselect) || defined(__NR_select) -#ifndef __NR__newselect -#define __NR__newselect __NR_select -#endif - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#else -#error None of __NR_select, __NR_pselect6, nor __NR__newselect defined, cannot implement sys_select() -#endif -} - -static __attribute__((unused)) -int sys_setpgid(pid_t pid, pid_t pgid) -{ - return my_syscall2(__NR_setpgid, pid, pgid); -} - -static __attribute__((unused)) -pid_t sys_setsid(void) -{ - return my_syscall0(__NR_setsid); -} - -static __attribute__((unused)) -int sys_stat(const char *path, struct stat *buf) -{ - struct sys_stat_struct stat; - long ret; - -#ifdef __NR_newfstatat - /* only solution for arm64 */ - ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); -#elif defined(__NR_stat) - ret = my_syscall2(__NR_stat, path, &stat); -#else -#error Neither __NR_newfstatat nor __NR_stat defined, cannot implement sys_stat() -#endif - buf->st_dev = stat.st_dev; - buf->st_ino = stat.st_ino; - buf->st_mode = stat.st_mode; - buf->st_nlink = stat.st_nlink; - buf->st_uid = stat.st_uid; - buf->st_gid = stat.st_gid; - buf->st_rdev = stat.st_rdev; - buf->st_size = stat.st_size; - buf->st_blksize = stat.st_blksize; - buf->st_blocks = stat.st_blocks; - buf->st_atime = stat.st_atime; - buf->st_mtime = stat.st_mtime; - buf->st_ctime = stat.st_ctime; - return ret; -} - - -static __attribute__((unused)) -int sys_symlink(const char *old, const char *new) -{ -#ifdef __NR_symlinkat - return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#elif defined(__NR_symlink) - return my_syscall2(__NR_symlink, old, new); -#else -#error Neither __NR_symlinkat nor __NR_symlink defined, cannot implement sys_symlink() -#endif -} - -static __attribute__((unused)) -mode_t sys_umask(mode_t mode) -{ - return my_syscall1(__NR_umask, mode); -} - -static __attribute__((unused)) -int sys_umount2(const char *path, int flags) -{ - return my_syscall2(__NR_umount2, path, flags); -} - -static __attribute__((unused)) -int sys_unlink(const char *path) -{ -#ifdef __NR_unlinkat - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#elif defined(__NR_unlink) - return my_syscall1(__NR_unlink, path); -#else -#error Neither __NR_unlinkat nor __NR_unlink defined, cannot implement sys_unlink() -#endif -} - -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - return my_syscall4(__NR_wait4, pid, status, options, rusage); -} - -static __attribute__((unused)) -pid_t sys_waitpid(pid_t pid, int *status, int options) -{ - return sys_wait4(pid, status, options, 0); -} - -static __attribute__((unused)) -pid_t sys_wait(int *status) -{ - return sys_waitpid(-1, status, 0); -} - -static __attribute__((unused)) -ssize_t sys_write(int fd, const void *buf, size_t count) -{ - return my_syscall3(__NR_write, fd, buf, count); -} - - -/* Below are the libc-compatible syscalls which return x or -1 and set errno. - * They rely on the functions above. Similarly they're marked static so that it - * is possible to assign pointers to them if needed. - */ - -static __attribute__((unused)) -int brk(void *addr) -{ - void *ret = sys_brk(addr); - - if (!ret) { - SET_ERRNO(ENOMEM); - return -1; - } - return 0; -} - -static __attribute__((noreturn,unused)) -void exit(int status) -{ - sys_exit(status); -} - -static __attribute__((unused)) -int chdir(const char *path) -{ - int ret = sys_chdir(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chmod(const char *path, mode_t mode) -{ - int ret = sys_chmod(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chown(const char *path, uid_t owner, gid_t group) -{ - int ret = sys_chown(path, owner, group); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chroot(const char *path) -{ - int ret = sys_chroot(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int close(int fd) -{ - int ret = sys_close(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup(int fd) -{ - int ret = sys_dup(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup2(int old, int new) -{ - int ret = sys_dup2(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -#ifdef __NR_dup3 -static __attribute__((unused)) -int dup3(int old, int new, int flags) -{ - int ret = sys_dup3(old, new, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} -#endif - -static __attribute__((unused)) -int execve(const char *filename, char *const argv[], char *const envp[]) -{ - int ret = sys_execve(filename, argv, envp); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t fork(void) -{ - pid_t ret = sys_fork(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int fsync(int fd) -{ - int ret = sys_fsync(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - int ret = sys_getdents64(fd, dirp, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgid(pid_t pid) -{ - pid_t ret = sys_getpgid(pid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgrp(void) -{ - pid_t ret = sys_getpgrp(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpid(void) -{ - pid_t ret = sys_getpid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t gettid(void) -{ - pid_t ret = sys_gettid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret = sys_gettimeofday(tv, tz); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - int ret = sys_ioctl(fd, req, value); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int kill(pid_t pid, int signal) -{ - int ret = sys_kill(pid, signal); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int link(const char *old, const char *new) -{ - int ret = sys_link(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -off_t lseek(int fd, off_t offset, int whence) -{ - off_t ret = sys_lseek(fd, offset, whence); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mkdir(const char *path, mode_t mode) -{ - int ret = sys_mkdir(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mknod(const char *path, mode_t mode, dev_t dev) -{ - int ret = sys_mknod(path, mode, dev); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mount(const char *src, const char *tgt, - const char *fst, unsigned long flags, - const void *data) -{ - int ret = sys_mount(src, tgt, fst, flags, data); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int open(const char *path, int flags, mode_t mode) -{ - int ret = sys_open(path, flags, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int pivot_root(const char *new, const char *old) -{ - int ret = sys_pivot_root(new, old); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int poll(struct pollfd *fds, int nfds, int timeout) -{ - int ret = sys_poll(fds, nfds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t read(int fd, void *buf, size_t count) -{ - ssize_t ret = sys_read(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int reboot(int cmd) -{ - int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -void *sbrk(intptr_t inc) -{ - void *ret; - - /* first call to find current end */ - if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) - return ret + inc; - - SET_ERRNO(ENOMEM); - return (void *)-1; -} - -static __attribute__((unused)) -int sched_yield(void) -{ - int ret = sys_sched_yield(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ - int ret = sys_select(nfds, rfds, wfds, efds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int setpgid(pid_t pid, pid_t pgid) -{ - int ret = sys_setpgid(pid, pgid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t setsid(void) -{ - pid_t ret = sys_setsid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -unsigned int sleep(unsigned int seconds) -{ - struct timeval my_timeval = { seconds, 0 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return my_timeval.tv_sec + !!my_timeval.tv_usec; - else - return 0; -} - -static __attribute__((unused)) -int msleep(unsigned int msecs) -{ - struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return (my_timeval.tv_sec * 1000) + - (my_timeval.tv_usec / 1000) + - !!(my_timeval.tv_usec % 1000); - else - return 0; -} - -static __attribute__((unused)) -int stat(const char *path, struct stat *buf) -{ - int ret = sys_stat(path, buf); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int symlink(const char *old, const char *new) -{ - int ret = sys_symlink(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - -static __attribute__((unused)) -mode_t umask(mode_t mode) -{ - return sys_umask(mode); -} - -static __attribute__((unused)) -int umount2(const char *path, int flags) -{ - int ret = sys_umount2(path, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int unlink(const char *path) -{ - int ret = sys_unlink(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - pid_t ret = sys_wait4(pid, status, options, rusage); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t waitpid(pid_t pid, int *status, int options) -{ - pid_t ret = sys_waitpid(pid, status, options); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait(int *status) -{ - pid_t ret = sys_wait(status); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t write(int fd, const void *buf, size_t count) -{ - ssize_t ret = sys_write(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -/* some size-optimized reimplementations of a few common str* and mem* - * functions. They're marked static, except memcpy() and raise() which are used - * by libgcc on ARM, so they are marked weak instead in order not to cause an - * error when building a program made of multiple files (not recommended). - */ - -static __attribute__((unused)) -void *memmove(void *dst, const void *src, size_t len) -{ - ssize_t pos = (dst <= src) ? -1 : (long)len; - void *ret = dst; - - while (len--) { - pos += (dst <= src) ? 1 : -1; - ((char *)dst)[pos] = ((char *)src)[pos]; - } - return ret; -} - -static __attribute__((unused)) -void *memset(void *dst, int b, size_t len) -{ - char *p = dst; - - while (len--) - *(p++) = b; - return dst; -} - -static __attribute__((unused)) -int memcmp(const void *s1, const void *s2, size_t n) -{ - size_t ofs = 0; - char c1 = 0; - - while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { - ofs++; - } - return c1; -} - -static __attribute__((unused)) -char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - while ((*dst++ = *src++)); - return ret; -} - -static __attribute__((unused)) -char *strchr(const char *s, int c) -{ - while (*s) { - if (*s == (char)c) - return (char *)s; - s++; - } - return NULL; -} - -static __attribute__((unused)) -char *strrchr(const char *s, int c) -{ - const char *ret = NULL; - - while (*s) { - if (*s == (char)c) - ret = s; - s++; - } - return (char *)ret; -} - -static __attribute__((unused)) -size_t nolibc_strlen(const char *str) -{ - size_t len; - - for (len = 0; str[len]; len++); - return len; -} - -#define strlen(str) ({ \ - __builtin_constant_p((str)) ? \ - __builtin_strlen((str)) : \ - nolibc_strlen((str)); \ -}) - -static __attribute__((unused)) -int isdigit(int c) -{ - return (unsigned int)(c - '0') <= 9; -} - -static __attribute__((unused)) -long atol(const char *s) -{ - unsigned long ret = 0; - unsigned long d; - int neg = 0; - - if (*s == '-') { - neg = 1; - s++; - } - - while (1) { - d = (*s++) - '0'; - if (d > 9) - break; - ret *= 10; - ret += d; - } - - return neg ? -ret : ret; -} - -static __attribute__((unused)) -int atoi(const char *s) -{ - return atol(s); -} - -static __attribute__((unused)) -const char *ltoa(long in) -{ - /* large enough for -9223372036854775808 */ - static char buffer[21]; - char *pos = buffer + sizeof(buffer) - 1; - int neg = in < 0; - unsigned long n = neg ? -in : in; - - *pos-- = '\0'; - do { - *pos-- = '0' + n % 10; - n /= 10; - if (pos < buffer) - return pos + 1; - } while (n); - - if (neg) - *pos-- = '-'; - return pos + 1; -} - -__attribute__((weak,unused)) -void *memcpy(void *dst, const void *src, size_t len) -{ - return memmove(dst, src, len); -} - -/* needed by libgcc for divide by zero */ -__attribute__((weak,unused)) -int raise(int signal) -{ - return kill(getpid(), signal); -} - -/* Here come a few helper functions */ - -static __attribute__((unused)) -void FD_ZERO(fd_set *set) -{ - memset(set, 0, sizeof(*set)); -} - -static __attribute__((unused)) -void FD_SET(int fd, fd_set *set) -{ - if (fd < 0 || fd >= FD_SETSIZE) - return; - set->fd32[fd / 32] |= 1 << (fd & 31); -} - -/* WARNING, it only deals with the 4096 first majors and 256 first minors */ -static __attribute__((unused)) -dev_t makedev(unsigned int major, unsigned int minor) -{ - return ((major & 0xfff) << 8) | (minor & 0xff); -} +#endif /* _NOLIBC_H */ diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h new file mode 100644 index 000000000000..ef47e71e2be3 --- /dev/null +++ b/tools/include/nolibc/signal.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * signal function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_SIGNAL_H +#define _NOLIBC_SIGNAL_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + +/* This one is not marked static as it's needed by libgcc for divide by zero */ +__attribute__((weak,unused,section(".text.nolibc_raise"))) +int raise(int signal) +{ + return sys_kill(sys_getpid(), signal); +} + +#endif /* _NOLIBC_SIGNAL_H */ diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h new file mode 100644 index 000000000000..1747ae125392 --- /dev/null +++ b/tools/include/nolibc/std.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Standard definitions and types for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_STD_H +#define _NOLIBC_STD_H + +/* Declare a few quite common macros and types that usually are in stdlib.h, + * stdint.h, ctype.h, unistd.h and a few other common locations. Please place + * integer type definitions and generic macros here, but avoid OS-specific and + * syscall-specific stuff, as this file is expected to be included very early. + */ + +/* note: may already be defined */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +/* stdint types */ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +/* those are commonly provided by sys/types.h */ +typedef unsigned int dev_t; +typedef unsigned long ino_t; +typedef unsigned int mode_t; +typedef signed int pid_t; +typedef unsigned int uid_t; +typedef unsigned int gid_t; +typedef unsigned long nlink_t; +typedef signed long off_t; +typedef signed long blksize_t; +typedef signed long blkcnt_t; +typedef signed long time_t; + +#endif /* _NOLIBC_STD_H */ diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h new file mode 100644 index 000000000000..15dedf8d0902 --- /dev/null +++ b/tools/include/nolibc/stdio.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * minimal stdio function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_STDIO_H +#define _NOLIBC_STDIO_H + +#include <stdarg.h> + +#include "std.h" +#include "arch.h" +#include "errno.h" +#include "types.h" +#include "sys.h" +#include "stdlib.h" +#include "string.h" + +#ifndef EOF +#define EOF (-1) +#endif + +/* just define FILE as a non-empty type */ +typedef struct FILE { + char dummy[1]; +} FILE; + +/* We define the 3 common stdio files as constant invalid pointers that + * are easily recognized. + */ +static __attribute__((unused)) FILE* const stdin = (FILE*)-3; +static __attribute__((unused)) FILE* const stdout = (FILE*)-2; +static __attribute__((unused)) FILE* const stderr = (FILE*)-1; + +/* getc(), fgetc(), getchar() */ + +#define getc(stream) fgetc(stream) + +static __attribute__((unused)) +int fgetc(FILE* stream) +{ + unsigned char ch; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; + + if (read(fd, &ch, 1) <= 0) + return EOF; + return ch; +} + +static __attribute__((unused)) +int getchar(void) +{ + return fgetc(stdin); +} + + +/* putc(), fputc(), putchar() */ + +#define putc(c, stream) fputc(c, stream) + +static __attribute__((unused)) +int fputc(int c, FILE* stream) +{ + unsigned char ch = c; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; + + if (write(fd, &ch, 1) <= 0) + return EOF; + return ch; +} + +static __attribute__((unused)) +int putchar(int c) +{ + return fputc(c, stdout); +} + + +/* fwrite(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ + +/* internal fwrite()-like function which only takes a size and returns 0 on + * success or EOF on error. It automatically retries on short writes. + */ +static __attribute__((unused)) +int _fwrite(const void *buf, size_t size, FILE *stream) +{ + ssize_t ret; + int fd; + + if (stream < stdin || stream > stderr) + return EOF; + + fd = 3 + (long)stream; + + while (size) { + ret = write(fd, buf, size); + if (ret <= 0) + return EOF; + size -= ret; + buf += ret; + } + return 0; +} + +static __attribute__((unused)) +size_t fwrite(const void *s, size_t size, size_t nmemb, FILE *stream) +{ + size_t written; + + for (written = 0; written < nmemb; written++) { + if (_fwrite(s, size, stream) != 0) + break; + s += size; + } + return written; +} + +static __attribute__((unused)) +int fputs(const char *s, FILE *stream) +{ + return _fwrite(s, strlen(s), stream); +} + +static __attribute__((unused)) +int puts(const char *s) +{ + if (fputs(s, stdout) == EOF) + return EOF; + return putchar('\n'); +} + + +/* fgets() */ +static __attribute__((unused)) +char *fgets(char *s, int size, FILE *stream) +{ + int ofs; + int c; + + for (ofs = 0; ofs + 1 < size;) { + c = fgetc(stream); + if (c == EOF) + break; + s[ofs++] = c; + if (c == '\n') + break; + } + if (ofs < size) + s[ofs] = 0; + return ofs ? s : NULL; +} + + +/* minimal vfprintf(). It supports the following formats: + * - %[l*]{d,u,c,x,p} + * - %s + * - unknown modifiers are ignored. + */ +static __attribute__((unused)) +int vfprintf(FILE *stream, const char *fmt, va_list args) +{ + char escape, lpref, c; + unsigned long long v; + unsigned int written; + size_t len, ofs; + char tmpbuf[21]; + const char *outstr; + + written = ofs = escape = lpref = 0; + while (1) { + c = fmt[ofs++]; + + if (escape) { + /* we're in an escape sequence, ofs == 1 */ + escape = 0; + if (c == 'c' || c == 'd' || c == 'u' || c == 'x' || c == 'p') { + char *out = tmpbuf; + + if (c == 'p') + v = va_arg(args, unsigned long); + else if (lpref) { + if (lpref > 1) + v = va_arg(args, unsigned long long); + else + v = va_arg(args, unsigned long); + } else + v = va_arg(args, unsigned int); + + if (c == 'd') { + /* sign-extend the value */ + if (lpref == 0) + v = (long long)(int)v; + else if (lpref == 1) + v = (long long)(long)v; + } + + switch (c) { + case 'c': + out[0] = v; + out[1] = 0; + break; + case 'd': + i64toa_r(v, out); + break; + case 'u': + u64toa_r(v, out); + break; + case 'p': + *(out++) = '0'; + *(out++) = 'x'; + /* fall through */ + default: /* 'x' and 'p' above */ + u64toh_r(v, out); + break; + } + outstr = tmpbuf; + } + else if (c == 's') { + outstr = va_arg(args, char *); + if (!outstr) + outstr="(null)"; + } + else if (c == '%') { + /* queue it verbatim */ + continue; + } + else { + /* modifiers or final 0 */ + if (c == 'l') { + /* long format prefix, maintain the escape */ + lpref++; + } + escape = 1; + goto do_escape; + } + len = strlen(outstr); + goto flush_str; + } + + /* not an escape sequence */ + if (c == 0 || c == '%') { + /* flush pending data on escape or end */ + escape = 1; + lpref = 0; + outstr = fmt; + len = ofs - 1; + flush_str: + if (_fwrite(outstr, len, stream) != 0) + break; + + written += len; + do_escape: + if (c == 0) + break; + fmt += ofs; + ofs = 0; + continue; + } + + /* literal char, just queue it */ + } + return written; +} + +static __attribute__((unused)) +int fprintf(FILE *stream, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vfprintf(stream, fmt, args); + va_end(args); + return ret; +} + +static __attribute__((unused)) +int printf(const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vfprintf(stdout, fmt, args); + va_end(args); + return ret; +} + +static __attribute__((unused)) +void perror(const char *msg) +{ + fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno); +} + +#endif /* _NOLIBC_STDIO_H */ diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h new file mode 100644 index 000000000000..8fd32eaf8037 --- /dev/null +++ b/tools/include/nolibc/stdlib.h @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * stdlib function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_STDLIB_H +#define _NOLIBC_STDLIB_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" +#include "string.h" + +struct nolibc_heap { + size_t len; + char user_p[] __attribute__((__aligned__)); +}; + +/* Buffer used to store int-to-ASCII conversions. Will only be implemented if + * any of the related functions is implemented. The area is large enough to + * store "18446744073709551615" or "-9223372036854775808" and the final zero. + */ +static __attribute__((unused)) char itoa_buffer[21]; + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +/* must be exported, as it's used by libgcc for various divide functions */ +__attribute__((weak,unused,noreturn,section(".text.nolibc_abort"))) +void abort(void) +{ + sys_kill(sys_getpid(), SIGABRT); + for (;;); +} + +static __attribute__((unused)) +long atol(const char *s) +{ + unsigned long ret = 0; + unsigned long d; + int neg = 0; + + if (*s == '-') { + neg = 1; + s++; + } + + while (1) { + d = (*s++) - '0'; + if (d > 9) + break; + ret *= 10; + ret += d; + } + + return neg ? -ret : ret; +} + +static __attribute__((unused)) +int atoi(const char *s) +{ + return atol(s); +} + +static __attribute__((unused)) +void free(void *ptr) +{ + struct nolibc_heap *heap; + + if (!ptr) + return; + + heap = container_of(ptr, struct nolibc_heap, user_p); + munmap(heap, heap->len); +} + +/* getenv() tries to find the environment variable named <name> in the + * environment array pointed to by global variable "environ" which must be + * declared as a char **, and must be terminated by a NULL (it is recommended + * to set this variable to the "envp" argument of main()). If the requested + * environment variable exists its value is returned otherwise NULL is + * returned. getenv() is forcefully inlined so that the reference to "environ" + * will be dropped if unused, even at -O0. + */ +static __attribute__((unused)) +char *_getenv(const char *name, char **environ) +{ + int idx, i; + + if (environ) { + for (idx = 0; environ[idx]; idx++) { + for (i = 0; name[i] && name[i] == environ[idx][i];) + i++; + if (!name[i] && environ[idx][i] == '=') + return &environ[idx][i+1]; + } + } + return NULL; +} + +static inline __attribute__((unused,always_inline)) +char *getenv(const char *name) +{ + extern char **environ; + return _getenv(name, environ); +} + +static __attribute__((unused)) +void *malloc(size_t len) +{ + struct nolibc_heap *heap; + + /* Always allocate memory with size multiple of 4096. */ + len = sizeof(*heap) + len; + len = (len + 4095UL) & -4096UL; + heap = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, + -1, 0); + if (__builtin_expect(heap == MAP_FAILED, 0)) + return NULL; + + heap->len = len; + return heap->user_p; +} + +static __attribute__((unused)) +void *calloc(size_t size, size_t nmemb) +{ + void *orig; + size_t res = 0; + + if (__builtin_expect(__builtin_mul_overflow(nmemb, size, &res), 0)) { + SET_ERRNO(ENOMEM); + return NULL; + } + + /* + * No need to zero the heap, the MAP_ANONYMOUS in malloc() + * already does it. + */ + return malloc(res); +} + +static __attribute__((unused)) +void *realloc(void *old_ptr, size_t new_size) +{ + struct nolibc_heap *heap; + size_t user_p_len; + void *ret; + + if (!old_ptr) + return malloc(new_size); + + heap = container_of(old_ptr, struct nolibc_heap, user_p); + user_p_len = heap->len - sizeof(*heap); + /* + * Don't realloc() if @user_p_len >= @new_size, this block of + * memory is still enough to handle the @new_size. Just return + * the same pointer. + */ + if (user_p_len >= new_size) + return old_ptr; + + ret = malloc(new_size); + if (__builtin_expect(!ret, 0)) + return NULL; + + memcpy(ret, heap->user_p, heap->len); + munmap(heap, heap->len); + return ret; +} + +/* Converts the unsigned long integer <in> to its hex representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (17 bytes for "ffffffffffffffff" or 9 for "ffffffff"). The + * buffer is filled from the first byte, and the number of characters emitted + * (not counting the trailing zero) is returned. The function is constructed + * in a way to optimize the code size and avoid any divide that could add a + * dependency on large external functions. + */ +static __attribute__((unused)) +int utoh_r(unsigned long in, char *buffer) +{ + signed char pos = (~0UL > 0xfffffffful) ? 60 : 28; + int digits = 0; + int dig; + + do { + dig = in >> pos; + in -= (uint64_t)dig << pos; + pos -= 4; + if (dig || digits || pos < 0) { + if (dig > 9) + dig += 'a' - '0' - 10; + buffer[digits++] = '0' + dig; + } + } while (pos >= 0); + + buffer[digits] = 0; + return digits; +} + +/* converts unsigned long <in> to an hex string using the static itoa_buffer + * and returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *utoh(unsigned long in) +{ + utoh_r(in, itoa_buffer); + return itoa_buffer; +} + +/* Converts the unsigned long integer <in> to its string representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (21 bytes for 18446744073709551615 in 64-bit, 11 for + * 4294967295 in 32-bit). The buffer is filled from the first byte, and the + * number of characters emitted (not counting the trailing zero) is returned. + * The function is constructed in a way to optimize the code size and avoid + * any divide that could add a dependency on large external functions. + */ +static __attribute__((unused)) +int utoa_r(unsigned long in, char *buffer) +{ + unsigned long lim; + int digits = 0; + int pos = (~0UL > 0xfffffffful) ? 19 : 9; + int dig; + + do { + for (dig = 0, lim = 1; dig < pos; dig++) + lim *= 10; + + if (digits || in >= lim || !pos) { + for (dig = 0; in >= lim; dig++) + in -= lim; + buffer[digits++] = '0' + dig; + } + } while (pos--); + + buffer[digits] = 0; + return digits; +} + +/* Converts the signed long integer <in> to its string representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (21 bytes for -9223372036854775808 in 64-bit, 12 for + * -2147483648 in 32-bit). The buffer is filled from the first byte, and the + * number of characters emitted (not counting the trailing zero) is returned. + */ +static __attribute__((unused)) +int itoa_r(long in, char *buffer) +{ + char *ptr = buffer; + int len = 0; + + if (in < 0) { + in = -in; + *(ptr++) = '-'; + len++; + } + len += utoa_r(in, ptr); + return len; +} + +/* for historical compatibility, same as above but returns the pointer to the + * buffer. + */ +static inline __attribute__((unused)) +char *ltoa_r(long in, char *buffer) +{ + itoa_r(in, buffer); + return buffer; +} + +/* converts long integer <in> to a string using the static itoa_buffer and + * returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *itoa(long in) +{ + itoa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts long integer <in> to a string using the static itoa_buffer and + * returns the pointer to that string. Same as above, for compatibility. + */ +static inline __attribute__((unused)) +char *ltoa(long in) +{ + itoa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts unsigned long integer <in> to a string using the static itoa_buffer + * and returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *utoa(unsigned long in) +{ + utoa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* Converts the unsigned 64-bit integer <in> to its hex representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (17 bytes for "ffffffffffffffff"). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. The function is constructed in a way to optimize + * the code size and avoid any divide that could add a dependency on large + * external functions. + */ +static __attribute__((unused)) +int u64toh_r(uint64_t in, char *buffer) +{ + signed char pos = 60; + int digits = 0; + int dig; + + do { + if (sizeof(long) >= 8) { + dig = (in >> pos) & 0xF; + } else { + /* 32-bit platforms: avoid a 64-bit shift */ + uint32_t d = (pos >= 32) ? (in >> 32) : in; + dig = (d >> (pos & 31)) & 0xF; + } + if (dig > 9) + dig += 'a' - '0' - 10; + pos -= 4; + if (dig || digits || pos < 0) + buffer[digits++] = '0' + dig; + } while (pos >= 0); + + buffer[digits] = 0; + return digits; +} + +/* converts uint64_t <in> to an hex string using the static itoa_buffer and + * returns the pointer to that string. + */ +static inline __attribute__((unused)) +char *u64toh(uint64_t in) +{ + u64toh_r(in, itoa_buffer); + return itoa_buffer; +} + +/* Converts the unsigned 64-bit integer <in> to its string representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (21 bytes for 18446744073709551615). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. The function is constructed in a way to optimize + * the code size and avoid any divide that could add a dependency on large + * external functions. + */ +static __attribute__((unused)) +int u64toa_r(uint64_t in, char *buffer) +{ + unsigned long long lim; + int digits = 0; + int pos = 19; /* start with the highest possible digit */ + int dig; + + do { + for (dig = 0, lim = 1; dig < pos; dig++) + lim *= 10; + + if (digits || in >= lim || !pos) { + for (dig = 0; in >= lim; dig++) + in -= lim; + buffer[digits++] = '0' + dig; + } + } while (pos--); + + buffer[digits] = 0; + return digits; +} + +/* Converts the signed 64-bit integer <in> to its string representation into + * buffer <buffer>, which must be long enough to store the number and the + * trailing zero (21 bytes for -9223372036854775808). The buffer is filled from + * the first byte, and the number of characters emitted (not counting the + * trailing zero) is returned. + */ +static __attribute__((unused)) +int i64toa_r(int64_t in, char *buffer) +{ + char *ptr = buffer; + int len = 0; + + if (in < 0) { + in = -in; + *(ptr++) = '-'; + len++; + } + len += u64toa_r(in, ptr); + return len; +} + +/* converts int64_t <in> to a string using the static itoa_buffer and returns + * the pointer to that string. + */ +static inline __attribute__((unused)) +char *i64toa(int64_t in) +{ + i64toa_r(in, itoa_buffer); + return itoa_buffer; +} + +/* converts uint64_t <in> to a string using the static itoa_buffer and returns + * the pointer to that string. + */ +static inline __attribute__((unused)) +char *u64toa(uint64_t in) +{ + u64toa_r(in, itoa_buffer); + return itoa_buffer; +} + +#endif /* _NOLIBC_STDLIB_H */ diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h new file mode 100644 index 000000000000..bef35bee9c44 --- /dev/null +++ b/tools/include/nolibc/string.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * string function definitions for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_STRING_H +#define _NOLIBC_STRING_H + +#include "std.h" + +static void *malloc(size_t len); + +/* + * As much as possible, please keep functions alphabetically sorted. + */ + +static __attribute__((unused)) +int memcmp(const void *s1, const void *s2, size_t n) +{ + size_t ofs = 0; + char c1 = 0; + + while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + ofs++; + } + return c1; +} + +static __attribute__((unused)) +void *_nolibc_memcpy_up(void *dst, const void *src, size_t len) +{ + size_t pos = 0; + + while (pos < len) { + ((char *)dst)[pos] = ((const char *)src)[pos]; + pos++; + } + return dst; +} + +static __attribute__((unused)) +void *_nolibc_memcpy_down(void *dst, const void *src, size_t len) +{ + while (len) { + len--; + ((char *)dst)[len] = ((const char *)src)[len]; + } + return dst; +} + +/* might be ignored by the compiler without -ffreestanding, then found as + * missing. + */ +__attribute__((weak,unused,section(".text.nolibc_memmove"))) +void *memmove(void *dst, const void *src, size_t len) +{ + size_t dir, pos; + + pos = len; + dir = -1; + + if (dst < src) { + pos = -1; + dir = 1; + } + + while (len) { + pos += dir; + ((char *)dst)[pos] = ((const char *)src)[pos]; + len--; + } + return dst; +} + +/* must be exported, as it's used by libgcc on ARM */ +__attribute__((weak,unused,section(".text.nolibc_memcpy"))) +void *memcpy(void *dst, const void *src, size_t len) +{ + return _nolibc_memcpy_up(dst, src, len); +} + +/* might be ignored by the compiler without -ffreestanding, then found as + * missing. + */ +__attribute__((weak,unused,section(".text.nolibc_memset"))) +void *memset(void *dst, int b, size_t len) +{ + char *p = dst; + + while (len--) + *(p++) = b; + return dst; +} + +static __attribute__((unused)) +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + s++; + } + return NULL; +} + +static __attribute__((unused)) +int strcmp(const char *a, const char *b) +{ + unsigned int c; + int diff; + + while (!(diff = (unsigned char)*a++ - (c = (unsigned char)*b++)) && c) + ; + return diff; +} + +static __attribute__((unused)) +char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + while ((*dst++ = *src++)); + return ret; +} + +/* this function is only used with arguments that are not constants or when + * it's not known because optimizations are disabled. + */ +static __attribute__((unused)) +size_t nolibc_strlen(const char *str) +{ + size_t len; + + for (len = 0; str[len]; len++); + return len; +} + +/* do not trust __builtin_constant_p() at -O0, as clang will emit a test and + * the two branches, then will rely on an external definition of strlen(). + */ +#if defined(__OPTIMIZE__) +#define strlen(str) ({ \ + __builtin_constant_p((str)) ? \ + __builtin_strlen((str)) : \ + nolibc_strlen((str)); \ +}) +#else +#define strlen(str) nolibc_strlen((str)) +#endif + +static __attribute__((unused)) +size_t strnlen(const char *str, size_t maxlen) +{ + size_t len; + + for (len = 0; (len < maxlen) && str[len]; len++); + return len; +} + +static __attribute__((unused)) +char *strdup(const char *str) +{ + size_t len; + char *ret; + + len = strlen(str); + ret = malloc(len + 1); + if (__builtin_expect(ret != NULL, 1)) + memcpy(ret, str, len + 1); + + return ret; +} + +static __attribute__((unused)) +char *strndup(const char *str, size_t maxlen) +{ + size_t len; + char *ret; + + len = strnlen(str, maxlen); + ret = malloc(len + 1); + if (__builtin_expect(ret != NULL, 1)) { + memcpy(ret, str, len); + ret[len] = '\0'; + } + + return ret; +} + +static __attribute__((unused)) +size_t strlcat(char *dst, const char *src, size_t size) +{ + size_t len; + char c; + + for (len = 0; dst[len]; len++) + ; + + for (;;) { + c = *src; + if (len < size) + dst[len] = c; + if (!c) + break; + len++; + src++; + } + + return len; +} + +static __attribute__((unused)) +size_t strlcpy(char *dst, const char *src, size_t size) +{ + size_t len; + char c; + + for (len = 0;;) { + c = src[len]; + if (len < size) + dst[len] = c; + if (!c) + break; + len++; + } + return len; +} + +static __attribute__((unused)) +char *strncat(char *dst, const char *src, size_t size) +{ + char *orig = dst; + + while (*dst) + dst++; + + while (size && (*dst = *src)) { + src++; + dst++; + size--; + } + + *dst = 0; + return orig; +} + +static __attribute__((unused)) +int strncmp(const char *a, const char *b, size_t size) +{ + unsigned int c; + int diff = 0; + + while (size-- && + !(diff = (unsigned char)*a++ - (c = (unsigned char)*b++)) && c) + ; + + return diff; +} + +static __attribute__((unused)) +char *strncpy(char *dst, const char *src, size_t size) +{ + size_t len; + + for (len = 0; len < size; len++) + if ((dst[len] = *src)) + src++; + return dst; +} + +static __attribute__((unused)) +char *strrchr(const char *s, int c) +{ + const char *ret = NULL; + + while (*s) { + if (*s == (char)c) + ret = s; + s++; + } + return (char *)ret; +} + +#endif /* _NOLIBC_STRING_H */ diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h new file mode 100644 index 000000000000..08491070387b --- /dev/null +++ b/tools/include/nolibc/sys.h @@ -0,0 +1,1247 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Syscall definitions for NOLIBC (those in man(2)) + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_SYS_H +#define _NOLIBC_SYS_H + +#include <stdarg.h> +#include "std.h" + +/* system includes */ +#include <asm/unistd.h> +#include <asm/signal.h> // for SIGCHLD +#include <asm/ioctls.h> +#include <asm/mman.h> +#include <linux/fs.h> +#include <linux/loop.h> +#include <linux/time.h> + +#include "arch.h" +#include "errno.h" +#include "types.h" + + +/* Functions in this file only describe syscalls. They're declared static so + * that the compiler usually decides to inline them while still being allowed + * to pass a pointer to one of their instances. Each syscall exists in two + * versions: + * - the "internal" ones, which matches the raw syscall interface at the + * kernel level, which may sometimes slightly differ from the documented + * libc-level ones. For example most of them return either a valid value + * or -errno. All of these are prefixed with "sys_". They may be called + * by non-portable applications if desired. + * + * - the "exported" ones, whose interface must closely match the one + * documented in man(2), that applications are supposed to expect. These + * ones rely on the internal ones, and set errno. + * + * Each syscall will be defined with the two functions, sorted in alphabetical + * order applied to the exported names. + * + * In case of doubt about the relevance of a function here, only those which + * set errno should be defined here. Wrappers like those appearing in man(3) + * should not be placed here. + */ + + +/* + * int brk(void *addr); + * void *sbrk(intptr_t inc) + */ + +static __attribute__((unused)) +void *sys_brk(void *addr) +{ + return (void *)my_syscall1(__NR_brk, addr); +} + +static __attribute__((unused)) +int brk(void *addr) +{ + void *ret = sys_brk(addr); + + if (!ret) { + SET_ERRNO(ENOMEM); + return -1; + } + return 0; +} + +static __attribute__((unused)) +void *sbrk(intptr_t inc) +{ + void *ret; + + /* first call to find current end */ + if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) + return ret + inc; + + SET_ERRNO(ENOMEM); + return (void *)-1; +} + + +/* + * int chdir(const char *path); + */ + +static __attribute__((unused)) +int sys_chdir(const char *path) +{ + return my_syscall1(__NR_chdir, path); +} + +static __attribute__((unused)) +int chdir(const char *path) +{ + int ret = sys_chdir(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chmod(const char *path, mode_t mode); + */ + +static __attribute__((unused)) +int sys_chmod(const char *path, mode_t mode) +{ +#ifdef __NR_fchmodat + return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); +#elif defined(__NR_chmod) + return my_syscall2(__NR_chmod, path, mode); +#else +#error Neither __NR_fchmodat nor __NR_chmod defined, cannot implement sys_chmod() +#endif +} + +static __attribute__((unused)) +int chmod(const char *path, mode_t mode) +{ + int ret = sys_chmod(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chown(const char *path, uid_t owner, gid_t group); + */ + +static __attribute__((unused)) +int sys_chown(const char *path, uid_t owner, gid_t group) +{ +#ifdef __NR_fchownat + return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); +#elif defined(__NR_chown) + return my_syscall3(__NR_chown, path, owner, group); +#else +#error Neither __NR_fchownat nor __NR_chown defined, cannot implement sys_chown() +#endif +} + +static __attribute__((unused)) +int chown(const char *path, uid_t owner, gid_t group) +{ + int ret = sys_chown(path, owner, group); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int chroot(const char *path); + */ + +static __attribute__((unused)) +int sys_chroot(const char *path) +{ + return my_syscall1(__NR_chroot, path); +} + +static __attribute__((unused)) +int chroot(const char *path) +{ + int ret = sys_chroot(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int close(int fd); + */ + +static __attribute__((unused)) +int sys_close(int fd) +{ + return my_syscall1(__NR_close, fd); +} + +static __attribute__((unused)) +int close(int fd) +{ + int ret = sys_close(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup(int fd); + */ + +static __attribute__((unused)) +int sys_dup(int fd) +{ + return my_syscall1(__NR_dup, fd); +} + +static __attribute__((unused)) +int dup(int fd) +{ + int ret = sys_dup(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup2(int old, int new); + */ + +static __attribute__((unused)) +int sys_dup2(int old, int new) +{ +#ifdef __NR_dup3 + return my_syscall3(__NR_dup3, old, new, 0); +#elif defined(__NR_dup2) + return my_syscall2(__NR_dup2, old, new); +#else +#error Neither __NR_dup3 nor __NR_dup2 defined, cannot implement sys_dup2() +#endif +} + +static __attribute__((unused)) +int dup2(int old, int new) +{ + int ret = sys_dup2(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int dup3(int old, int new, int flags); + */ + +#ifdef __NR_dup3 +static __attribute__((unused)) +int sys_dup3(int old, int new, int flags) +{ + return my_syscall3(__NR_dup3, old, new, flags); +} + +static __attribute__((unused)) +int dup3(int old, int new, int flags) +{ + int ret = sys_dup3(old, new, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} +#endif + + +/* + * int execve(const char *filename, char *const argv[], char *const envp[]); + */ + +static __attribute__((unused)) +int sys_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return my_syscall3(__NR_execve, filename, argv, envp); +} + +static __attribute__((unused)) +int execve(const char *filename, char *const argv[], char *const envp[]) +{ + int ret = sys_execve(filename, argv, envp); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * void exit(int status); + */ + +static __attribute__((noreturn,unused)) +void sys_exit(int status) +{ + my_syscall1(__NR_exit, status & 255); + while(1); // shut the "noreturn" warnings. +} + +static __attribute__((noreturn,unused)) +void exit(int status) +{ + sys_exit(status); +} + + +/* + * pid_t fork(void); + */ + +static __attribute__((unused)) +pid_t sys_fork(void) +{ +#ifdef __NR_clone + /* note: some archs only have clone() and not fork(). Different archs + * have a different API, but most archs have the flags on first arg and + * will not use the rest with no other flag. + */ + return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); +#elif defined(__NR_fork) + return my_syscall0(__NR_fork); +#else +#error Neither __NR_clone nor __NR_fork defined, cannot implement sys_fork() +#endif +} + +static __attribute__((unused)) +pid_t fork(void) +{ + pid_t ret = sys_fork(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int fsync(int fd); + */ + +static __attribute__((unused)) +int sys_fsync(int fd) +{ + return my_syscall1(__NR_fsync, fd); +} + +static __attribute__((unused)) +int fsync(int fd) +{ + int ret = sys_fsync(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int getdents64(int fd, struct linux_dirent64 *dirp, int count); + */ + +static __attribute__((unused)) +int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + return my_syscall3(__NR_getdents64, fd, dirp, count); +} + +static __attribute__((unused)) +int getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + int ret = sys_getdents64(fd, dirp, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t getpgid(pid_t pid); + */ + +static __attribute__((unused)) +pid_t sys_getpgid(pid_t pid) +{ + return my_syscall1(__NR_getpgid, pid); +} + +static __attribute__((unused)) +pid_t getpgid(pid_t pid) +{ + pid_t ret = sys_getpgid(pid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t getpgrp(void); + */ + +static __attribute__((unused)) +pid_t sys_getpgrp(void) +{ + return sys_getpgid(0); +} + +static __attribute__((unused)) +pid_t getpgrp(void) +{ + return sys_getpgrp(); +} + + +/* + * pid_t getpid(void); + */ + +static __attribute__((unused)) +pid_t sys_getpid(void) +{ + return my_syscall0(__NR_getpid); +} + +static __attribute__((unused)) +pid_t getpid(void) +{ + return sys_getpid(); +} + + +/* + * pid_t getppid(void); + */ + +static __attribute__((unused)) +pid_t sys_getppid(void) +{ + return my_syscall0(__NR_getppid); +} + +static __attribute__((unused)) +pid_t getppid(void) +{ + return sys_getppid(); +} + + +/* + * pid_t gettid(void); + */ + +static __attribute__((unused)) +pid_t sys_gettid(void) +{ + return my_syscall0(__NR_gettid); +} + +static __attribute__((unused)) +pid_t gettid(void) +{ + return sys_gettid(); +} + + +/* + * int gettimeofday(struct timeval *tv, struct timezone *tz); + */ + +static __attribute__((unused)) +int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return my_syscall2(__NR_gettimeofday, tv, tz); +} + +static __attribute__((unused)) +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret = sys_gettimeofday(tv, tz); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int ioctl(int fd, unsigned long req, void *value); + */ + +static __attribute__((unused)) +int sys_ioctl(int fd, unsigned long req, void *value) +{ + return my_syscall3(__NR_ioctl, fd, req, value); +} + +static __attribute__((unused)) +int ioctl(int fd, unsigned long req, void *value) +{ + int ret = sys_ioctl(fd, req, value); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* + * int kill(pid_t pid, int signal); + */ + +static __attribute__((unused)) +int sys_kill(pid_t pid, int signal) +{ + return my_syscall2(__NR_kill, pid, signal); +} + +static __attribute__((unused)) +int kill(pid_t pid, int signal) +{ + int ret = sys_kill(pid, signal); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int link(const char *old, const char *new); + */ + +static __attribute__((unused)) +int sys_link(const char *old, const char *new) +{ +#ifdef __NR_linkat + return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); +#elif defined(__NR_link) + return my_syscall2(__NR_link, old, new); +#else +#error Neither __NR_linkat nor __NR_link defined, cannot implement sys_link() +#endif +} + +static __attribute__((unused)) +int link(const char *old, const char *new) +{ + int ret = sys_link(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * off_t lseek(int fd, off_t offset, int whence); + */ + +static __attribute__((unused)) +off_t sys_lseek(int fd, off_t offset, int whence) +{ + return my_syscall3(__NR_lseek, fd, offset, whence); +} + +static __attribute__((unused)) +off_t lseek(int fd, off_t offset, int whence) +{ + off_t ret = sys_lseek(fd, offset, whence); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int mkdir(const char *path, mode_t mode); + */ + +static __attribute__((unused)) +int sys_mkdir(const char *path, mode_t mode) +{ +#ifdef __NR_mkdirat + return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); +#elif defined(__NR_mkdir) + return my_syscall2(__NR_mkdir, path, mode); +#else +#error Neither __NR_mkdirat nor __NR_mkdir defined, cannot implement sys_mkdir() +#endif +} + +static __attribute__((unused)) +int mkdir(const char *path, mode_t mode) +{ + int ret = sys_mkdir(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int mknod(const char *path, mode_t mode, dev_t dev); + */ + +static __attribute__((unused)) +long sys_mknod(const char *path, mode_t mode, dev_t dev) +{ +#ifdef __NR_mknodat + return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); +#elif defined(__NR_mknod) + return my_syscall3(__NR_mknod, path, mode, dev); +#else +#error Neither __NR_mknodat nor __NR_mknod defined, cannot implement sys_mknod() +#endif +} + +static __attribute__((unused)) +int mknod(const char *path, mode_t mode, dev_t dev) +{ + int ret = sys_mknod(path, mode, dev); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +#ifndef MAP_SHARED +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#endif + +#ifndef MAP_FAILED +#define MAP_FAILED ((void *)-1) +#endif + +static __attribute__((unused)) +void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, + off_t offset) +{ +#ifndef my_syscall6 + /* Function not implemented. */ + return -ENOSYS; +#else + + int n; + +#if defined(__i386__) + n = __NR_mmap2; + offset >>= 12; +#else + n = __NR_mmap; +#endif + + return (void *)my_syscall6(n, addr, length, prot, flags, fd, offset); +#endif +} + +static __attribute__((unused)) +void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void *ret = sys_mmap(addr, length, prot, flags, fd, offset); + + if ((unsigned long)ret >= -4095UL) { + SET_ERRNO(-(long)ret); + ret = MAP_FAILED; + } + return ret; +} + +static __attribute__((unused)) +int sys_munmap(void *addr, size_t length) +{ + return my_syscall2(__NR_munmap, addr, length); +} + +static __attribute__((unused)) +int munmap(void *addr, size_t length) +{ + int ret = sys_munmap(addr, length); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* + * int mount(const char *source, const char *target, + * const char *fstype, unsigned long flags, + * const void *data); + */ +static __attribute__((unused)) +int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return my_syscall5(__NR_mount, src, tgt, fst, flags, data); +} + +static __attribute__((unused)) +int mount(const char *src, const char *tgt, + const char *fst, unsigned long flags, + const void *data) +{ + int ret = sys_mount(src, tgt, fst, flags, data); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int open(const char *path, int flags[, mode_t mode]); + */ + +static __attribute__((unused)) +int sys_open(const char *path, int flags, mode_t mode) +{ +#ifdef __NR_openat + return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); +#elif defined(__NR_open) + return my_syscall3(__NR_open, path, flags, mode); +#else +#error Neither __NR_openat nor __NR_open defined, cannot implement sys_open() +#endif +} + +static __attribute__((unused)) +int open(const char *path, int flags, ...) +{ + mode_t mode = 0; + int ret; + + if (flags & O_CREAT) { + va_list args; + + va_start(args, flags); + mode = va_arg(args, mode_t); + va_end(args); + } + + ret = sys_open(path, flags, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int pivot_root(const char *new, const char *old); + */ + +static __attribute__((unused)) +int sys_pivot_root(const char *new, const char *old) +{ + return my_syscall2(__NR_pivot_root, new, old); +} + +static __attribute__((unused)) +int pivot_root(const char *new, const char *old) +{ + int ret = sys_pivot_root(new, old); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int poll(struct pollfd *fds, int nfds, int timeout); + */ + +static __attribute__((unused)) +int sys_poll(struct pollfd *fds, int nfds, int timeout) +{ +#if defined(__NR_ppoll) + struct timespec t; + + if (timeout >= 0) { + t.tv_sec = timeout / 1000; + t.tv_nsec = (timeout % 1000) * 1000000; + } + return my_syscall4(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL); +#elif defined(__NR_poll) + return my_syscall3(__NR_poll, fds, nfds, timeout); +#else +#error Neither __NR_ppoll nor __NR_poll defined, cannot implement sys_poll() +#endif +} + +static __attribute__((unused)) +int poll(struct pollfd *fds, int nfds, int timeout) +{ + int ret = sys_poll(fds, nfds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * ssize_t read(int fd, void *buf, size_t count); + */ + +static __attribute__((unused)) +ssize_t sys_read(int fd, void *buf, size_t count) +{ + return my_syscall3(__NR_read, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t read(int fd, void *buf, size_t count) +{ + ssize_t ret = sys_read(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int reboot(int cmd); + * <cmd> is among LINUX_REBOOT_CMD_* + */ + +static __attribute__((unused)) +ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) +{ + return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); +} + +static __attribute__((unused)) +int reboot(int cmd) +{ + int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int sched_yield(void); + */ + +static __attribute__((unused)) +int sys_sched_yield(void) +{ + return my_syscall0(__NR_sched_yield); +} + +static __attribute__((unused)) +int sched_yield(void) +{ + int ret = sys_sched_yield(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int select(int nfds, fd_set *read_fds, fd_set *write_fds, + * fd_set *except_fds, struct timeval *timeout); + */ + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#elif defined(__NR__newselect) || defined(__NR_select) +#ifndef __NR__newselect +#define __NR__newselect __NR_select +#endif + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#else +#error None of __NR_select, __NR_pselect6, nor __NR__newselect defined, cannot implement sys_select() +#endif +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + int ret = sys_select(nfds, rfds, wfds, efds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int setpgid(pid_t pid, pid_t pgid); + */ + +static __attribute__((unused)) +int sys_setpgid(pid_t pid, pid_t pgid) +{ + return my_syscall2(__NR_setpgid, pid, pgid); +} + +static __attribute__((unused)) +int setpgid(pid_t pid, pid_t pgid) +{ + int ret = sys_setpgid(pid, pgid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t setsid(void); + */ + +static __attribute__((unused)) +pid_t sys_setsid(void) +{ + return my_syscall0(__NR_setsid); +} + +static __attribute__((unused)) +pid_t setsid(void) +{ + pid_t ret = sys_setsid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int stat(const char *path, struct stat *buf); + * Warning: the struct stat's layout is arch-dependent. + */ + +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct sys_stat_struct stat; + long ret; + +#ifdef __NR_newfstatat + /* only solution for arm64 */ + ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); +#elif defined(__NR_stat) + ret = my_syscall2(__NR_stat, path, &stat); +#else +#error Neither __NR_newfstatat nor __NR_stat defined, cannot implement sys_stat() +#endif + buf->st_dev = stat.st_dev; + buf->st_ino = stat.st_ino; + buf->st_mode = stat.st_mode; + buf->st_nlink = stat.st_nlink; + buf->st_uid = stat.st_uid; + buf->st_gid = stat.st_gid; + buf->st_rdev = stat.st_rdev; + buf->st_size = stat.st_size; + buf->st_blksize = stat.st_blksize; + buf->st_blocks = stat.st_blocks; + buf->st_atime = stat.st_atime; + buf->st_mtime = stat.st_mtime; + buf->st_ctime = stat.st_ctime; + return ret; +} + +static __attribute__((unused)) +int stat(const char *path, struct stat *buf) +{ + int ret = sys_stat(path, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int symlink(const char *old, const char *new); + */ + +static __attribute__((unused)) +int sys_symlink(const char *old, const char *new) +{ +#ifdef __NR_symlinkat + return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); +#elif defined(__NR_symlink) + return my_syscall2(__NR_symlink, old, new); +#else +#error Neither __NR_symlinkat nor __NR_symlink defined, cannot implement sys_symlink() +#endif +} + +static __attribute__((unused)) +int symlink(const char *old, const char *new) +{ + int ret = sys_symlink(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * mode_t umask(mode_t mode); + */ + +static __attribute__((unused)) +mode_t sys_umask(mode_t mode) +{ + return my_syscall1(__NR_umask, mode); +} + +static __attribute__((unused)) +mode_t umask(mode_t mode) +{ + return sys_umask(mode); +} + + +/* + * int umount2(const char *path, int flags); + */ + +static __attribute__((unused)) +int sys_umount2(const char *path, int flags) +{ + return my_syscall2(__NR_umount2, path, flags); +} + +static __attribute__((unused)) +int umount2(const char *path, int flags) +{ + int ret = sys_umount2(path, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * int unlink(const char *path); + */ + +static __attribute__((unused)) +int sys_unlink(const char *path) +{ +#ifdef __NR_unlinkat + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); +#elif defined(__NR_unlink) + return my_syscall1(__NR_unlink, path); +#else +#error Neither __NR_unlinkat nor __NR_unlink defined, cannot implement sys_unlink() +#endif +} + +static __attribute__((unused)) +int unlink(const char *path) +{ + int ret = sys_unlink(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * pid_t wait(int *status); + * pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage); + * pid_t waitpid(pid_t pid, int *status, int options); + */ + +static __attribute__((unused)) +pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + return my_syscall4(__NR_wait4, pid, status, options, rusage); +} + +static __attribute__((unused)) +pid_t wait(int *status) +{ + pid_t ret = sys_wait4(-1, status, 0, NULL); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + pid_t ret = sys_wait4(pid, status, options, rusage); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +static __attribute__((unused)) +pid_t waitpid(pid_t pid, int *status, int options) +{ + pid_t ret = sys_wait4(pid, status, options, NULL); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +/* + * ssize_t write(int fd, const void *buf, size_t count); + */ + +static __attribute__((unused)) +ssize_t sys_write(int fd, const void *buf, size_t count) +{ + return my_syscall3(__NR_write, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t write(int fd, const void *buf, size_t count) +{ + ssize_t ret = sys_write(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + + +#endif /* _NOLIBC_SYS_H */ diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h new file mode 100644 index 000000000000..d18b7661fdd7 --- /dev/null +++ b/tools/include/nolibc/time.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * time function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_TIME_H +#define _NOLIBC_TIME_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + +static __attribute__((unused)) +time_t time(time_t *tptr) +{ + struct timeval tv; + + /* note, cannot fail here */ + sys_gettimeofday(&tv, NULL); + + if (tptr) + *tptr = tv.tv_sec; + return tv.tv_sec; +} + +#endif /* _NOLIBC_TIME_H */ diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h new file mode 100644 index 000000000000..959997034e55 --- /dev/null +++ b/tools/include/nolibc/types.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Special types used by various syscalls for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_TYPES_H +#define _NOLIBC_TYPES_H + +#include "std.h" +#include <linux/time.h> + + +/* Only the generic macros and types may be defined here. The arch-specific + * ones such as the O_RDONLY and related macros used by fcntl() and open(), or + * the layout of sys_stat_struct must not be defined here. + */ + +/* stat flags (WARNING, octal here) */ +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFBLK 0060000 +#define S_IFREG 0100000 +#define S_IFIFO 0010000 +#define S_IFLNK 0120000 +#define S_IFSOCK 0140000 +#define S_IFMT 0170000 + +#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) +#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) +#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) +#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) +#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) +#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) +#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) + +/* dirent types */ +#define DT_UNKNOWN 0x0 +#define DT_FIFO 0x1 +#define DT_CHR 0x2 +#define DT_DIR 0x4 +#define DT_BLK 0x6 +#define DT_REG 0x8 +#define DT_LNK 0xa +#define DT_SOCK 0xc + +/* commonly an fd_set represents 256 FDs */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256 +#endif + +/* PATH_MAX and MAXPATHLEN are often used and found with plenty of different + * values. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#ifndef MAXPATHLEN +#define MAXPATHLEN (PATH_MAX) +#endif + +/* Special FD used by all the *at functions */ +#ifndef AT_FDCWD +#define AT_FDCWD (-100) +#endif + +/* whence values for lseek() */ +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +/* cmd for reboot() */ +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 0x28121969 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART 0x01234567 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 + +/* Macros used on waitpid()'s return status */ +#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) +#define WIFEXITED(status) (((status) & 0x7f) == 0) + +/* waitpid() flags */ +#define WNOHANG 1 + +/* standard exit() codes */ +#define EXIT_SUCCESS 0 +#define EXIT_FAILURE 1 + +/* for select() */ +typedef struct { + uint32_t fd32[(FD_SETSIZE + 31) / 32]; +} fd_set; + +#define FD_CLR(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fd32[__fd / 32] &= ~(1U << (__fd & 31)); \ + } while (0) + +#define FD_SET(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fd32[__fd / 32] |= 1U << (__fd & 31); \ + } while (0) + +#define FD_ISSET(fd, set) ({ \ + fd_set *__set = (set); \ + int __fd = (fd); \ + int __r = 0; \ + if (__fd >= 0) \ + __r = !!(__set->fd32[__fd / 32] & 1U << (__fd & 31)); \ + __r; \ + }) + +#define FD_ZERO(set) do { \ + fd_set *__set = (set); \ + int __idx; \ + for (__idx = 0; __idx < (FD_SETSIZE+31) / 32; __idx ++) \ + __set->fd32[__idx] = 0; \ + } while (0) + +/* for poll() */ +#define POLLIN 0x0001 +#define POLLPRI 0x0002 +#define POLLOUT 0x0004 +#define POLLERR 0x0008 +#define POLLHUP 0x0010 +#define POLLNVAL 0x0020 + +struct pollfd { + int fd; + short int events; + short int revents; +}; + +/* for getdents64() */ +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +/* needed by wait4() */ +struct rusage { + struct timeval ru_utime; + struct timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* The format of the struct as returned by the libc to the application, which + * significantly differs from the format returned by the stat() syscall flavours. + */ +struct stat { + dev_t st_dev; /* ID of device containing file */ + ino_t st_ino; /* inode number */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device ID (if special file) */ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for file system I/O */ + blkcnt_t st_blocks; /* number of 512B blocks allocated */ + time_t st_atime; /* time of last access */ + time_t st_mtime; /* time of last modification */ + time_t st_ctime; /* time of last status change */ +}; + +/* WARNING, it only deals with the 4096 first majors and 256 first minors */ +#define makedev(major, minor) ((dev_t)((((major) & 0xfff) << 8) | ((minor) & 0xff))) +#define major(dev) ((unsigned int)(((dev) >> 8) & 0xfff)) +#define minor(dev) ((unsigned int)(((dev) & 0xff)) + +#ifndef offsetof +#define offsetof(TYPE, FIELD) ((size_t) &((TYPE *)0)->FIELD) +#endif + +#ifndef container_of +#define container_of(PTR, TYPE, FIELD) ({ \ + __typeof__(((TYPE *)0)->FIELD) *__FIELD_PTR = (PTR); \ + (TYPE *)((char *) __FIELD_PTR - offsetof(TYPE, FIELD)); \ +}) +#endif + +#endif /* _NOLIBC_TYPES_H */ diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h new file mode 100644 index 000000000000..1c25e20ee360 --- /dev/null +++ b/tools/include/nolibc/unistd.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * unistd function definitions for NOLIBC + * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> + */ + +#ifndef _NOLIBC_UNISTD_H +#define _NOLIBC_UNISTD_H + +#include "std.h" +#include "arch.h" +#include "types.h" +#include "sys.h" + + +static __attribute__((unused)) +int msleep(unsigned int msecs) +{ + struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return (my_timeval.tv_sec * 1000) + + (my_timeval.tv_usec / 1000) + + !!(my_timeval.tv_usec % 1000); + else + return 0; +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int usleep(unsigned int usecs) +{ + struct timeval my_timeval = { usecs / 1000000, usecs % 1000000 }; + + return sys_select(0, 0, 0, 0, &my_timeval); +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +#endif /* _NOLIBC_UNISTD_H */ diff --git a/tools/memory-model/README b/tools/memory-model/README index 9edd402704c4..dab38904206a 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -54,7 +54,8 @@ klitmus7 Compatibility Table -- 4.14 7.48 -- 4.15 -- 4.19 7.49 -- 4.20 -- 5.5 7.54 -- - 5.6 -- 7.56 -- + 5.6 -- 5.16 7.56 -- + 5.17 -- 7.56.1 -- ============ ========== diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index f3bf9297bcc0..1bd64e7404b9 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -553,9 +553,16 @@ ifndef NO_LIBELF ifeq ($(feature-libbpf), 1) EXTLIBS += -lbpf $(call detected,CONFIG_LIBBPF_DYNAMIC) + + $(call feature_check,libbpf-btf__load_from_kernel_by_id) + ifeq ($(feature-libbpf-btf__load_from_kernel_by_id), 1) + CFLAGS += -DHAVE_LIBBPF_BTF__LOAD_FROM_KERNEL_BY_ID + endif else dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); endif + else + CFLAGS += -DHAVE_LIBBPF_BTF__LOAD_FROM_KERNEL_BY_ID endif endif diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 207c56805c55..0ed177991ad0 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -9,6 +9,8 @@ #include "../../../util/perf_regs.h" #include "../../../util/debug.h" #include "../../../util/event.h" +#include "../../../util/pmu.h" +#include "../../../util/pmu-hybrid.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG(AX, PERF_REG_X86_AX), @@ -284,12 +286,22 @@ uint64_t arch__intr_reg_mask(void) .disabled = 1, .exclude_kernel = 1, }; + struct perf_pmu *pmu; int fd; /* * In an unnamed union, init it here to build on older gcc versions */ attr.sample_period = 1; + if (perf_pmu__has_hybrid()) { + /* + * The same register set is supported among different hybrid PMUs. + * Only check the first available one. + */ + pmu = list_first_entry(&perf_pmu__hybrid_pmus, typeof(*pmu), hybrid_list); + attr.config |= (__u64)pmu->type << PERF_PMU_TYPE_SHIFT; + } + event_attr_init(&attr); fd = sys_perf_event_open(&attr, 0, -1, -1, 0); diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index d5289fa58a4f..20eed1e53f80 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -1740,7 +1740,7 @@ static int __bench_numa(const char *name) "GB/sec,", "total-speed", "GB/sec total speed"); if (g->p.show_details >= 2) { - char tname[14 + 2 * 10 + 1]; + char tname[14 + 2 * 11 + 1]; struct thread_data *td; for (p = 0; p < g->p.nr_proc; p++) { for (t = 0; t < g->p.nr_threads; t++) { diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 57b9591f7cbb..17c023823713 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -222,11 +222,11 @@ static int __test__bpf(int idx) ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, bpf_testcase_table[idx].prog_id, - true, NULL); + false, NULL); if (ret != TEST_OK || !obj_buf || !obj_buf_sz) { pr_debug("Unable to get BPF object, %s\n", bpf_testcase_table[idx].msg_compile_fail); - if (idx == 0) + if ((idx == 0) || (ret == TEST_SKIP)) return TEST_SKIP; else return TEST_FAIL; @@ -364,9 +364,11 @@ static int test__bpf_prologue_test(struct test_suite *test __maybe_unused, static struct test_case bpf_tests[] = { #ifdef HAVE_LIBBPF_SUPPORT TEST_CASE("Basic BPF filtering", basic_bpf_test), - TEST_CASE("BPF pinning", bpf_pinning), + TEST_CASE_REASON("BPF pinning", bpf_pinning, + "clang isn't installed or environment missing BPF support"), #ifdef HAVE_BPF_PROLOGUE - TEST_CASE("BPF prologue generation", bpf_prologue_test), + TEST_CASE_REASON("BPF prologue generation", bpf_prologue_test, + "clang isn't installed or environment missing BPF support"), #else TEST_CASE_REASON("BPF prologue generation", bpf_prologue_test, "not compiled in"), #endif diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index fac3717d9ba1..d336cda94a11 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -279,6 +279,7 @@ static const char *shell_test__description(char *description, size_t size, { FILE *fp; char filename[PATH_MAX]; + int ch; path__join(filename, sizeof(filename), path, name); fp = fopen(filename, "r"); @@ -286,7 +287,9 @@ static const char *shell_test__description(char *description, size_t size, return NULL; /* Skip shebang */ - while (fgetc(fp) != '\n'); + do { + ch = fgetc(fp); + } while (ch != EOF && ch != '\n'); description = fgets(description, size, fp); fclose(fp); @@ -417,7 +420,8 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width, .priv = &st, }; - if (!perf_test__matches(test_suite.desc, curr, argc, argv)) + if (test_suite.desc == NULL || + !perf_test__matches(test_suite.desc, curr, argc, argv)) continue; st.file = ent->d_name; diff --git a/tools/perf/tests/shell/stat_all_pmu.sh b/tools/perf/tests/shell/stat_all_pmu.sh index b30dba455f36..9c9ef33e0b3c 100755 --- a/tools/perf/tests/shell/stat_all_pmu.sh +++ b/tools/perf/tests/shell/stat_all_pmu.sh @@ -5,6 +5,16 @@ set -e for p in $(perf list --raw-dump pmu); do + # In powerpc, skip the events for hv_24x7 and hv_gpci. + # These events needs input values to be filled in for + # core, chip, partition id based on system. + # Example: hv_24x7/CPM_ADJUNCT_INST,domain=?,core=?/ + # hv_gpci/event,partition_id=?/ + # Hence skip these events for ppc. + if echo "$p" |grep -Eq 'hv_24x7|hv_gpci' ; then + echo "Skipping: Event '$p' in powerpc" + continue + fi echo "Testing $p" result=$(perf stat -e "$p" true 2>&1) if ! echo "$result" | grep -q "$p" && ! echo "$result" | grep -q "<not supported>" ; then diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index ee1e3dcbc0bd..d23a9e322ff5 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -109,6 +109,17 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) && strncmp(session->header.env.arch, "aarch64", 7)) return TEST_SKIP; + /* + * In powerpc pSeries platform, not all the topology information + * are exposed via sysfs. Due to restriction, detail like + * physical_package_id will be set to -1. Hence skip this + * test if physical_package_id returns -1 for cpu from perf_cpu_map. + */ + if (strncmp(session->header.env.arch, "powerpc", 7)) { + if (cpu__get_socket_id(perf_cpu_map__cpu(map, 0)) == -1) + return TEST_SKIP; + } + TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu); for (i = 0; i < session->header.env.nr_cpus_avail; i++) { diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 94624733af7e..8271ab764eb5 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -22,7 +22,8 @@ #include "record.h" #include "util/synthetic-events.h" -struct btf * __weak btf__load_from_kernel_by_id(__u32 id) +#ifndef HAVE_LIBBPF_BTF__LOAD_FROM_KERNEL_BY_ID +struct btf *btf__load_from_kernel_by_id(__u32 id) { struct btf *btf; #pragma GCC diagnostic push @@ -32,6 +33,7 @@ struct btf * __weak btf__load_from_kernel_by_id(__u32 id) return err ? ERR_PTR(err) : btf; } +#endif int __weak bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name __maybe_unused, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f9a320694b85..a7f93f5a1ac8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1151,9 +1151,20 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) struct branch_entry *entries = perf_sample__branch_entries(sample); uint64_t i; - printf("%s: nr:%" PRIu64 "\n", - !callstack ? "... branch stack" : "... branch callstack", - sample->branch_stack->nr); + if (!callstack) { + printf("%s: nr:%" PRIu64 "\n", "... branch stack", sample->branch_stack->nr); + } else { + /* the reason of adding 1 to nr is because after expanding + * branch stack it generates nr + 1 callstack records. e.g., + * B()->C() + * A()->B() + * the final callstack should be: + * C() + * B() + * A() + */ + printf("%s: nr:%" PRIu64 "\n", "... branch callstack", sample->branch_stack->nr+1); + } for (i = 0; i < sample->branch_stack->nr; i++) { struct branch_entry *e = &entries[i]; @@ -1169,8 +1180,13 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) (unsigned)e->flags.reserved, e->flags.type ? branch_type_name(e->flags.type) : ""); } else { - printf("..... %2"PRIu64": %016" PRIx64 "\n", - i, i > 0 ? e->from : e->to); + if (i == 0) { + printf("..... %2"PRIu64": %016" PRIx64 "\n" + "..... %2"PRIu64": %016" PRIx64 "\n", + i, e->to, i+1, e->from); + } else { + printf("..... %2"PRIu64": %016" PRIx64 "\n", i+1, e->from); + } } } } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 817a2de264b4..c1af37e11f98 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -472,9 +472,10 @@ int perf_stat_process_counter(struct perf_stat_config *config, int perf_event__process_stat_event(struct perf_session *session, union perf_event *event) { - struct perf_counts_values count; + struct perf_counts_values count, *ptr; struct perf_record_stat *st = &event->stat; struct evsel *counter; + int cpu_map_idx; count.val = st->val; count.ena = st->ena; @@ -485,8 +486,18 @@ int perf_event__process_stat_event(struct perf_session *session, pr_err("Failed to resolve counter for stat event.\n"); return -EINVAL; } - - *perf_counts(counter->counts, st->cpu, st->thread) = count; + cpu_map_idx = perf_cpu_map__idx(evsel__cpus(counter), (struct perf_cpu){.cpu = st->cpu}); + if (cpu_map_idx == -1) { + pr_err("Invalid CPU %d for event %s.\n", st->cpu, evsel__name(counter)); + return -EINVAL; + } + ptr = perf_counts(counter->counts, cpu_map_idx, st->thread); + if (ptr == NULL) { + pr_err("Failed to find perf count for CPU %d thread %d on event %s.\n", + st->cpu, st->thread, evsel__name(counter)); + return -EINVAL; + } + *ptr = count; counter->supported = true; return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 0d06ffa95d9d..93d77574b255 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -208,7 +208,7 @@ static bool sanity_check_pmu(struct kvm_vm *vm) return success; } -static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents) +static struct kvm_pmu_event_filter *alloc_pmu_event_filter(uint32_t nevents) { struct kvm_pmu_event_filter *f; int size = sizeof(*f) + nevents * sizeof(f->events[0]); @@ -220,19 +220,29 @@ static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents) return f; } -static struct kvm_pmu_event_filter *event_filter(uint32_t action) + +static struct kvm_pmu_event_filter * +create_pmu_event_filter(const uint64_t event_list[], + int nevents, uint32_t action) { struct kvm_pmu_event_filter *f; int i; - f = make_pmu_event_filter(ARRAY_SIZE(event_list)); + f = alloc_pmu_event_filter(nevents); f->action = action; - for (i = 0; i < ARRAY_SIZE(event_list); i++) + for (i = 0; i < nevents; i++) f->events[i] = event_list[i]; return f; } +static struct kvm_pmu_event_filter *event_filter(uint32_t action) +{ + return create_pmu_event_filter(event_list, + ARRAY_SIZE(event_list), + action); +} + /* * Remove the first occurrence of 'event' (if any) from the filter's * event list. @@ -271,6 +281,22 @@ static uint64_t test_with_filter(struct kvm_vm *vm, return run_vm_to_sync(vm); } +static void test_amd_deny_list(struct kvm_vm *vm) +{ + uint64_t event = EVENT(0x1C2, 0); + struct kvm_pmu_event_filter *f; + uint64_t count; + + f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY); + count = test_with_filter(vm, f); + + free(f); + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + static void test_member_deny_list(struct kvm_vm *vm) { struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); @@ -453,6 +479,9 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } + if (use_amd_pmu()) + test_amd_deny_list(vm); + test_without_filter(vm); test_member_deny_list(vm); test_member_allow_list(vm); diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index c87e674b61b1..e811090f7748 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -86,7 +86,7 @@ TEST_PROGS = bridge_igmp.sh \ vxlan_bridge_1d_port_8472.sh \ vxlan_bridge_1d.sh \ vxlan_bridge_1q_ipv6.sh \ - vxlan_bridge_1q_port_8472_ipv6.sh + vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ vxlan_symmetric_ipv6.sh \ diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 7314257d248a..48ef112f42c2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1444,6 +1444,33 @@ chk_prio_nr() [ "${dump_stats}" = 1 ] && dump_stats } +chk_subflow_nr() +{ + local need_title="$1" + local msg="$2" + local subflow_nr=$3 + local cnt1 + local cnt2 + + if [ -n "${need_title}" ]; then + printf "%03u %-36s %s" "${TEST_COUNT}" "${TEST_NAME}" "${msg}" + else + printf "%-${nr_blank}s %s" " " "${msg}" + fi + + cnt1=$(ss -N $ns1 -tOni | grep -c token) + cnt2=$(ss -N $ns2 -tOni | grep -c token) + if [ "$cnt1" != "$subflow_nr" -o "$cnt2" != "$subflow_nr" ]; then + echo "[fail] got $cnt1:$cnt2 subflows expected $subflow_nr" + fail_test + dump_stats=1 + else + echo "[ ok ]" + fi + + [ "${dump_stats}" = 1 ] && ( ss -N $ns1 -tOni ; ss -N $ns1 -tOni | grep token; ip -n $ns1 mptcp endpoint ) +} + chk_link_usage() { local ns=$1 @@ -2556,7 +2583,7 @@ fastclose_tests() fi } -implicit_tests() +endpoint_tests() { # userspace pm type prevents add_addr if reset "implicit EP"; then @@ -2578,6 +2605,23 @@ implicit_tests() $ns2 10.0.2.2 id 1 flags signal wait fi + + if reset "delete and re-add"; then + pm_nl_set_limits $ns1 1 1 + pm_nl_set_limits $ns2 1 1 + pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 4 0 0 slow & + + wait_mpj $ns2 + pm_nl_del_endpoint $ns2 2 10.0.2.2 + sleep 0.5 + chk_subflow_nr needtitle "after delete" 1 + + pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow + wait_mpj $ns2 + chk_subflow_nr "" "after re-add" 2 + wait + fi } # [$1: error message] @@ -2624,7 +2668,7 @@ all_tests_sorted=( d@deny_join_id0_tests m@fullmesh_tests z@fastclose_tests - I@implicit_tests + I@endpoint_tests ) all_tests_args="" diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index c35ba24f994c..66d0414d8e4b 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -301,7 +301,7 @@ specify_qemu_cpus () { echo $2 -smp $3 ;; qemu-system-ppc64) - nt="`lscpu | grep '^NUMA node0' | sed -e 's/^[^,]*,\([0-9]*\),.*$/\1/'`" + nt="`lscpu | sed -n 's/^Thread(s) per core:\s*//p'`" echo $2 -smp cores=`expr \( $3 + $nt - 1 \) / $nt`,threads=$nt ;; esac diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 5f682fc892dd..88983cba7956 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -36,7 +36,7 @@ do then egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags files="$files $i.diags $i" - elif ! test -f ${scenariobasedir}/vmlinux + elif ! test -f ${scenariobasedir}/vmlinux && ! test -f "${rundir}/re-run" then echo No ${scenariobasedir}/vmlinux file > $i.diags files="$files $i.diags $i" diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 0a5419982ab3..0789c5606d2a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -33,7 +33,12 @@ do TORTURE_SUITE="`cat $i/../torture_suite`" configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags - kvm-recheck-${TORTURE_SUITE}.sh $i + case "${TORTURE_SUITE}" in + X*) + ;; + *) + kvm-recheck-${TORTURE_SUITE}.sh $i + esac if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137 then echo QEMU error, output: diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 8c4c1e4792d0..0ff59bd8b640 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -138,14 +138,14 @@ chmod +x $T/bin/kvm-remote-*.sh # Check first to avoid the need for cleanup for system-name typos for i in $systems do - ncpus="`ssh $i getconf _NPROCESSORS_ONLN 2> /dev/null`" - echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" + ncpus="`ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN 2> /dev/null`" ret=$? if test "$ret" -ne 0 then echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log" exit 4 fi + echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log" done # Download and expand the tarball on all systems. @@ -153,14 +153,14 @@ echo Build-products tarball: `du -h $T/binres.tgz` | tee -a "$oldrun/remote-log" for i in $systems do echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log" - cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + cat $T/binres.tgz | ssh -o BatchMode=yes $i "cd /tmp; tar -xzf -" ret=$? tries=0 while test "$ret" -ne 0 do echo Unable to download $T/binres.tgz to system $i, waiting and then retrying. $tries prior retries. | tee -a "$oldrun/remote-log" sleep 60 - cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -" + cat $T/binres.tgz | ssh -o BatchMode=yes $i "cd /tmp; tar -xzf -" ret=$? if test "$ret" -ne 0 then @@ -185,7 +185,7 @@ checkremotefile () { while : do - ssh $1 "test -f \"$2\"" + ssh -o BatchMode=yes $1 "test -f \"$2\"" ret=$? if test "$ret" -eq 255 then @@ -228,7 +228,7 @@ startbatches () { then continue # System still running last test, skip. fi - ssh "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 + ssh -o BatchMode=yes "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2 ret=$? if test "$ret" -ne 0 then @@ -267,7 +267,7 @@ do sleep 30 done echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log" - ( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) + ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - ) done ( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 55b2c1533282..263e16aeca0e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -44,6 +44,7 @@ TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 +torture_qemu_mem_default=1 TORTURE_REMOTE= TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu @@ -86,7 +87,7 @@ usage () { echo " --remote" echo " --results absolute-pathname" echo " --shutdown-grace seconds" - echo " --torture lock|rcu|rcuscale|refscale|scf" + echo " --torture lock|rcu|rcuscale|refscale|scf|X*" echo " --trust-make" exit 1 } @@ -180,6 +181,10 @@ do ;; --kasan) TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + if test -n "$torture_qemu_mem_default" + then + TORTURE_QEMU_MEM=2G + fi ;; --kconfig|--kconfigs) checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' @@ -202,6 +207,7 @@ do --memory) checkarg --memory "(memory size)" $# "$2" '^[0-9]\+[MG]\?$' error TORTURE_QEMU_MEM=$2 + torture_qemu_mem_default= shift ;; --no-initrd) @@ -231,7 +237,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\|X.*\)$' '^--' TORTURE_SUITE=$2 TORTURE_MOD="`echo $TORTURE_SUITE | sed -e 's/^\(lock\|rcu\|scf\)$/\1torture/'`" shift diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bfe09e2829c8..d477618e7261 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -54,6 +54,7 @@ do_kvfree=yes do_kasan=yes do_kcsan=no do_clocksourcewd=yes +do_rt=yes # doyesno - Helper function for yes/no arguments function doyesno () { @@ -82,6 +83,7 @@ usage () { echo " --do-rcuscale / --do-no-rcuscale" echo " --do-rcutorture / --do-no-rcutorture" echo " --do-refscale / --do-no-refscale" + echo " --do-rt / --do-no-rt" echo " --do-scftorture / --do-no-scftorture" echo " --duration [ <minutes> | <hours>h | <days>d ]" echo " --kcsan-kmake-arg kernel-make-arguments" @@ -118,6 +120,7 @@ do do_scftorture=yes do_rcuscale=yes do_refscale=yes + do_rt=yes do_kvfree=yes do_kasan=yes do_kcsan=yes @@ -148,6 +151,7 @@ do do_scftorture=no do_rcuscale=no do_refscale=no + do_rt=no do_kvfree=no do_kasan=no do_kcsan=no @@ -162,6 +166,9 @@ do --do-refscale|--do-no-refscale) do_refscale=`doyesno "$1" --do-refscale` ;; + --do-rt|--do-no-rt) + do_rt=`doyesno "$1" --do-rt` + ;; --do-scftorture|--do-no-scftorture) do_scftorture=`doyesno "$1" --do-scftorture` ;; @@ -322,6 +329,7 @@ then echo " --- make clean" > "$amcdir/Make.out" 2>&1 make -j$MAKE_ALLOTED_CPUS clean >> "$amcdir/Make.out" 2>&1 echo " --- make allmodconfig" >> "$amcdir/Make.out" 2>&1 + cp .config $amcdir make -j$MAKE_ALLOTED_CPUS allmodconfig >> "$amcdir/Make.out" 2>&1 echo " --- make " >> "$amcdir/Make.out" 2>&1 make -j$MAKE_ALLOTED_CPUS >> "$amcdir/Make.out" 2>&1 @@ -350,8 +358,19 @@ fi if test "$do_scftorture" = "yes" then - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make + torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make +fi + +if test "$do_rt" = "yes" +then + # With all post-boot grace periods forced to normal. + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 rcupdate.rcu_normal=1" + torture_set "rcurttorture" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "TREE03" --trust-make + + # With all post-boot grace periods forced to expedited. + torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=30000 rcupdate.rcu_expedited=1" + torture_set "rcurttorture-exp" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "TREE03" --trust-make fi if test "$do_refscale" = yes @@ -363,7 +382,7 @@ fi for prim in $primlist do torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make done if test "$do_rcuscale" = yes @@ -375,13 +394,13 @@ fi for prim in $primlist do torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make done if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 index 7093422050f6..6fd6acb94518 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -8,3 +8,5 @@ CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_RCU_EXPERT=y +CONFIG_FORCE_TASKS_RUDE_RCU=y +#CHECK#CONFIG_TASKS_RUDE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N index 2da8b49589a0..07f5e0a70ae7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N @@ -6,3 +6,5 @@ CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 index 3ca112444ce7..d84801b9a7ae 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 @@ -7,4 +7,5 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_TASKS_RCU=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 index ad2be91e5ee7..2f9fcffff5ae 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 @@ -2,3 +2,7 @@ CONFIG_SMP=n CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot index cd2a188eeb6d..b9b6d67cbc5f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot @@ -1 +1,2 @@ rcutorture.torture_type=tasks +rcutorture.stat_interval=60 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 index dc02083803ce..dea26c568678 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 @@ -7,3 +7,5 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_TASKS_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index e4d74e5fc1d0..85b407467454 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -4,8 +4,11 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 77541eeb4e9f..093ea6e8e65c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -7,5 +7,7 @@ CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 22ad0261728d..ae395981b5e5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -1,8 +1,9 @@ CONFIG_SMP=y CONFIG_NR_CPUS=8 -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=y CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 2789b47e4ecd..d30922d8c883 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -3,6 +3,7 @@ CONFIG_NR_CPUS=16 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index 8523a7515cbf..fc45645bb5f4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -13,3 +13,5 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n #CHECK#CONFIG_RCU_EXPERT=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index 4a00539bfdd7..a323d8948b7c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -3,6 +3,7 @@ CONFIG_NR_CPUS=56 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index effa415f9b92..e2bc99c785e7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -9,7 +9,7 @@ # rcutorture_param_n_barrier_cbs bootparam-string # -# Adds n_barrier_cbs rcutorture module parameter to kernels having it. +# Adds n_barrier_cbs rcutorture module parameter if not already specified. rcutorture_param_n_barrier_cbs () { if echo $1 | grep -q "rcutorture\.n_barrier_cbs" then @@ -30,13 +30,25 @@ rcutorture_param_onoff () { fi } +# rcutorture_param_stat_interval bootparam-string +# +# Adds stat_interval rcutorture module parameter if not already specified. +rcutorture_param_stat_interval () { + if echo $1 | grep -q "rcutorture\.stat_interval" + then + : + else + echo rcutorture.stat_interval=15 + fi +} + # per_version_boot_params bootparam-string config-file seconds # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 `rcutorture_param_onoff "$1" "$2"` \ `rcutorture_param_n_barrier_cbs "$1"` \ - rcutorture.stat_interval=15 \ + `rcutorture_param_stat_interval "$1"` \ rcutorture.shutdown_secs=$3 \ rcutorture.test_no_idle_hz=1 \ rcutorture.verbose=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 90942bb5bebc..6a00157bee5b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,5 +1,6 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_TRACE_RCU=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE index f110d9ffbe4c..b10706fd03a4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -16,3 +16,5 @@ CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_TRACE=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon index a98b58b54bb1..fbea3b13baba 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon @@ -1,2 +1,6 @@ CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_FORCE_TASKS_RCU=y +#CHECK#CONFIG_TASKS_RCU=y +CONFIG_FORCE_TASKS_TRACE_RCU=y +#CHECK#CONFIG_TASKS_TRACE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT index 7f06838a91e6..ef2b501a6971 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT @@ -15,3 +15,5 @@ CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT index b8429d6c6ebc..3a59346b3de7 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT +++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT @@ -7,3 +7,5 @@ CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n +CONFIG_KPROBES=n +CONFIG_FTRACE=n diff --git a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT index ae4992b141b0..cb37e08037d6 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT +++ b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT @@ -7,3 +7,4 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh index d3d9e35d3d55..2d949e58f5a5 100644 --- a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh @@ -25,6 +25,5 @@ per_version_boot_params () { echo $1 `scftorture_param_onoff "$1" "$2"` \ scftorture.stat_interval=15 \ scftorture.shutdown_secs=$3 \ - scftorture.verbose=1 \ - scf + scftorture.verbose=1 } diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 59b1dd4a549e..2a3ed401ce46 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -77,7 +77,8 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) idx = srcu_read_lock(&kvm->irq_srcu); - list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, + srcu_read_lock_held(&kvm->irq_srcu)) eventfd_signal(irqfd->resamplefd, 1); srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6d971fb1b08d..5ab12214e18d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1560,7 +1560,7 @@ static int kvm_prepare_memory_region(struct kvm *kvm, r = kvm_arch_prepare_memory_region(kvm, old, new, change); /* Free the bitmap on failure if it was allocated above. */ - if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap) + if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) kvm_destroy_dirty_bitmap(new); return r; |