Merge drm/drm-next into drm-intel-next-queued
Backmerging in order to pull "topic/phy-compliance". Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
This commit is contained in:
commit
2b703bbda2
11011 changed files with 485563 additions and 232470 deletions
|
@ -86,6 +86,8 @@ ForEachMacros:
|
||||||
- 'bio_for_each_segment_all'
|
- 'bio_for_each_segment_all'
|
||||||
- 'bio_list_for_each'
|
- 'bio_list_for_each'
|
||||||
- 'bip_for_each_vec'
|
- 'bip_for_each_vec'
|
||||||
|
- 'bitmap_for_each_clear_region'
|
||||||
|
- 'bitmap_for_each_set_region'
|
||||||
- 'blkg_for_each_descendant_post'
|
- 'blkg_for_each_descendant_post'
|
||||||
- 'blkg_for_each_descendant_pre'
|
- 'blkg_for_each_descendant_pre'
|
||||||
- 'blk_queue_for_each_rl'
|
- 'blk_queue_for_each_rl'
|
||||||
|
@ -115,6 +117,7 @@ ForEachMacros:
|
||||||
- 'drm_client_for_each_connector_iter'
|
- 'drm_client_for_each_connector_iter'
|
||||||
- 'drm_client_for_each_modeset'
|
- 'drm_client_for_each_modeset'
|
||||||
- 'drm_connector_for_each_possible_encoder'
|
- 'drm_connector_for_each_possible_encoder'
|
||||||
|
- 'drm_for_each_bridge_in_chain'
|
||||||
- 'drm_for_each_connector_iter'
|
- 'drm_for_each_connector_iter'
|
||||||
- 'drm_for_each_crtc'
|
- 'drm_for_each_crtc'
|
||||||
- 'drm_for_each_encoder'
|
- 'drm_for_each_encoder'
|
||||||
|
@ -136,9 +139,10 @@ ForEachMacros:
|
||||||
- 'for_each_bio'
|
- 'for_each_bio'
|
||||||
- 'for_each_board_func_rsrc'
|
- 'for_each_board_func_rsrc'
|
||||||
- 'for_each_bvec'
|
- 'for_each_bvec'
|
||||||
|
- 'for_each_card_auxs'
|
||||||
|
- 'for_each_card_auxs_safe'
|
||||||
- 'for_each_card_components'
|
- 'for_each_card_components'
|
||||||
- 'for_each_card_links'
|
- 'for_each_card_pre_auxs'
|
||||||
- 'for_each_card_links_safe'
|
|
||||||
- 'for_each_card_prelinks'
|
- 'for_each_card_prelinks'
|
||||||
- 'for_each_card_rtds'
|
- 'for_each_card_rtds'
|
||||||
- 'for_each_card_rtds_safe'
|
- 'for_each_card_rtds_safe'
|
||||||
|
@ -166,6 +170,7 @@ ForEachMacros:
|
||||||
- 'for_each_dpcm_fe'
|
- 'for_each_dpcm_fe'
|
||||||
- 'for_each_drhd_unit'
|
- 'for_each_drhd_unit'
|
||||||
- 'for_each_dss_dev'
|
- 'for_each_dss_dev'
|
||||||
|
- 'for_each_efi_handle'
|
||||||
- 'for_each_efi_memory_desc'
|
- 'for_each_efi_memory_desc'
|
||||||
- 'for_each_efi_memory_desc_in_map'
|
- 'for_each_efi_memory_desc_in_map'
|
||||||
- 'for_each_element'
|
- 'for_each_element'
|
||||||
|
@ -190,6 +195,7 @@ ForEachMacros:
|
||||||
- 'for_each_lru'
|
- 'for_each_lru'
|
||||||
- 'for_each_matching_node'
|
- 'for_each_matching_node'
|
||||||
- 'for_each_matching_node_and_match'
|
- 'for_each_matching_node_and_match'
|
||||||
|
- 'for_each_member'
|
||||||
- 'for_each_memblock'
|
- 'for_each_memblock'
|
||||||
- 'for_each_memblock_type'
|
- 'for_each_memblock_type'
|
||||||
- 'for_each_memcg_cache_index'
|
- 'for_each_memcg_cache_index'
|
||||||
|
@ -200,9 +206,11 @@ ForEachMacros:
|
||||||
- 'for_each_msi_entry'
|
- 'for_each_msi_entry'
|
||||||
- 'for_each_msi_entry_safe'
|
- 'for_each_msi_entry_safe'
|
||||||
- 'for_each_net'
|
- 'for_each_net'
|
||||||
|
- 'for_each_net_continue_reverse'
|
||||||
- 'for_each_netdev'
|
- 'for_each_netdev'
|
||||||
- 'for_each_netdev_continue'
|
- 'for_each_netdev_continue'
|
||||||
- 'for_each_netdev_continue_rcu'
|
- 'for_each_netdev_continue_rcu'
|
||||||
|
- 'for_each_netdev_continue_reverse'
|
||||||
- 'for_each_netdev_feature'
|
- 'for_each_netdev_feature'
|
||||||
- 'for_each_netdev_in_bond_rcu'
|
- 'for_each_netdev_in_bond_rcu'
|
||||||
- 'for_each_netdev_rcu'
|
- 'for_each_netdev_rcu'
|
||||||
|
@ -254,10 +262,10 @@ ForEachMacros:
|
||||||
- 'for_each_reserved_mem_region'
|
- 'for_each_reserved_mem_region'
|
||||||
- 'for_each_rtd_codec_dai'
|
- 'for_each_rtd_codec_dai'
|
||||||
- 'for_each_rtd_codec_dai_rollback'
|
- 'for_each_rtd_codec_dai_rollback'
|
||||||
- 'for_each_rtdcom'
|
- 'for_each_rtd_components'
|
||||||
- 'for_each_rtdcom_safe'
|
|
||||||
- 'for_each_set_bit'
|
- 'for_each_set_bit'
|
||||||
- 'for_each_set_bit_from'
|
- 'for_each_set_bit_from'
|
||||||
|
- 'for_each_set_clump8'
|
||||||
- 'for_each_sg'
|
- 'for_each_sg'
|
||||||
- 'for_each_sg_dma_page'
|
- 'for_each_sg_dma_page'
|
||||||
- 'for_each_sg_page'
|
- 'for_each_sg_page'
|
||||||
|
@ -267,6 +275,7 @@ ForEachMacros:
|
||||||
- 'for_each_subelement_id'
|
- 'for_each_subelement_id'
|
||||||
- '__for_each_thread'
|
- '__for_each_thread'
|
||||||
- 'for_each_thread'
|
- 'for_each_thread'
|
||||||
|
- 'for_each_wakeup_source'
|
||||||
- 'for_each_zone'
|
- 'for_each_zone'
|
||||||
- 'for_each_zone_zonelist'
|
- 'for_each_zone_zonelist'
|
||||||
- 'for_each_zone_zonelist_nodemask'
|
- 'for_each_zone_zonelist_nodemask'
|
||||||
|
@ -330,6 +339,7 @@ ForEachMacros:
|
||||||
- 'list_for_each'
|
- 'list_for_each'
|
||||||
- 'list_for_each_codec'
|
- 'list_for_each_codec'
|
||||||
- 'list_for_each_codec_safe'
|
- 'list_for_each_codec_safe'
|
||||||
|
- 'list_for_each_continue'
|
||||||
- 'list_for_each_entry'
|
- 'list_for_each_entry'
|
||||||
- 'list_for_each_entry_continue'
|
- 'list_for_each_entry_continue'
|
||||||
- 'list_for_each_entry_continue_rcu'
|
- 'list_for_each_entry_continue_rcu'
|
||||||
|
@ -351,6 +361,7 @@ ForEachMacros:
|
||||||
- 'llist_for_each_entry'
|
- 'llist_for_each_entry'
|
||||||
- 'llist_for_each_entry_safe'
|
- 'llist_for_each_entry_safe'
|
||||||
- 'llist_for_each_safe'
|
- 'llist_for_each_safe'
|
||||||
|
- 'mci_for_each_dimm'
|
||||||
- 'media_device_for_each_entity'
|
- 'media_device_for_each_entity'
|
||||||
- 'media_device_for_each_intf'
|
- 'media_device_for_each_intf'
|
||||||
- 'media_device_for_each_link'
|
- 'media_device_for_each_link'
|
||||||
|
@ -444,10 +455,16 @@ ForEachMacros:
|
||||||
- 'virtio_device_for_each_vq'
|
- 'virtio_device_for_each_vq'
|
||||||
- 'xa_for_each'
|
- 'xa_for_each'
|
||||||
- 'xa_for_each_marked'
|
- 'xa_for_each_marked'
|
||||||
|
- 'xa_for_each_range'
|
||||||
- 'xa_for_each_start'
|
- 'xa_for_each_start'
|
||||||
- 'xas_for_each'
|
- 'xas_for_each'
|
||||||
- 'xas_for_each_conflict'
|
- 'xas_for_each_conflict'
|
||||||
- 'xas_for_each_marked'
|
- 'xas_for_each_marked'
|
||||||
|
- 'xbc_array_for_each_value'
|
||||||
|
- 'xbc_for_each_key_value'
|
||||||
|
- 'xbc_node_for_each_array_value'
|
||||||
|
- 'xbc_node_for_each_child'
|
||||||
|
- 'xbc_node_for_each_key_value'
|
||||||
- 'zorro_for_each_dev'
|
- 'zorro_for_each_dev'
|
||||||
|
|
||||||
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
|
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
|
||||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,3 +1,4 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
#
|
#
|
||||||
# NOTE! Don't add files that are generated in specific
|
# NOTE! Don't add files that are generated in specific
|
||||||
# subdirectories here. Add them in the ".gitignore" file
|
# subdirectories here. Add them in the ".gitignore" file
|
||||||
|
|
4
.mailmap
4
.mailmap
|
@ -210,6 +210,7 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
|
||||||
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
||||||
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
||||||
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
||||||
|
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
|
||||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||||
Patrick Mochel <mochel@digitalimplant.org>
|
Patrick Mochel <mochel@digitalimplant.org>
|
||||||
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
|
||||||
|
@ -225,6 +226,7 @@ Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com>
|
||||||
Praveen BP <praveenbp@ti.com>
|
Praveen BP <praveenbp@ti.com>
|
||||||
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
||||||
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
||||||
|
Quentin Monnet <quentin@isovalent.com> <quentin.monnet@netronome.com>
|
||||||
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
|
||||||
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
|
Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
|
||||||
Rajesh Shah <rajesh.shah@intel.com>
|
Rajesh Shah <rajesh.shah@intel.com>
|
||||||
|
@ -243,9 +245,11 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
||||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||||
Sascha Hauer <s.hauer@pengutronix.de>
|
Sascha Hauer <s.hauer@pengutronix.de>
|
||||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||||
|
Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>
|
||||||
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
||||||
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||||
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||||
|
Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de>
|
||||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||||
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
|
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
|
||||||
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
|
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
|
||||||
|
|
2
COPYING
2
COPYING
|
@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see:
|
||||||
Documentation/process/license-rules.rst
|
Documentation/process/license-rules.rst
|
||||||
|
|
||||||
for more details.
|
for more details.
|
||||||
|
|
||||||
|
All contributions to the Linux Kernel are subject to this COPYING file.
|
||||||
|
|
5
CREDITS
5
CREDITS
|
@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem
|
||||||
S: Orlando, Florida
|
S: Orlando, Florida
|
||||||
S: USA
|
S: USA
|
||||||
|
|
||||||
|
N: Paul Burton
|
||||||
|
E: paulburton@kernel.org
|
||||||
|
W: https://pburton.com
|
||||||
|
D: MIPS maintainer 2018-2020
|
||||||
|
|
||||||
N: Lennert Buytenhek
|
N: Lennert Buytenhek
|
||||||
E: kernel@wantstofly.org
|
E: kernel@wantstofly.org
|
||||||
D: Original (2.4) rewrite of the ethernet bridging code
|
D: Original (2.4) rewrite of the ethernet bridging code
|
||||||
|
|
1
Documentation/.gitignore
vendored
1
Documentation/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
output
|
output
|
||||||
*.pyc
|
*.pyc
|
||||||
|
|
9
Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled
Normal file
9
Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
This ABI is renamed and moved to a new location /sys/kernel/fadump/enabled.
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump_enabled
|
||||||
|
Date: Feb 2012
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: read only
|
||||||
|
Primarily used to identify whether the FADump is enabled in
|
||||||
|
the kernel or not.
|
||||||
|
User: Kdump service
|
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_registered
Normal file
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_registered
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
This ABI is renamed and moved to a new location /sys/kernel/fadump/registered.
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump_registered
|
||||||
|
Date: Feb 2012
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: read/write
|
||||||
|
Helps to control the dump collect feature from userspace.
|
||||||
|
Setting 1 to this file enables the system to collect the
|
||||||
|
dump; setting 0 disables it.
|
||||||
|
User: Kdump service
|
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem
Normal file
10
Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
This ABI is renamed and moved to a new location /sys/kernel/fadump/release_mem.
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump_release_mem
|
||||||
|
Date: Feb 2012
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: write only
|
||||||
|
This is a special sysfs file and only available when
|
||||||
|
the system is booted to capture the vmcore using FADump.
|
||||||
|
It is used to release the memory reserved by FADump to
|
||||||
|
save the crash dump.
|
23
Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
Normal file
23
Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
What: /sys/fs/selinux/checkreqprot
|
||||||
|
Date: April 2005 (predates git)
|
||||||
|
KernelVersion: 2.6.12-rc2 (predates git)
|
||||||
|
Contact: selinux@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
|
||||||
|
The selinuxfs "checkreqprot" node allows SELinux to be configured
|
||||||
|
to check the protection requested by userspace for mmap/mprotect
|
||||||
|
calls instead of the actual protection applied by the kernel.
|
||||||
|
This was a compatibility mechanism for legacy userspace and
|
||||||
|
for the READ_IMPLIES_EXEC personality flag. However, if set to
|
||||||
|
1, it weakens security by allowing mappings to be made executable
|
||||||
|
without authorization by policy. The default value of checkreqprot
|
||||||
|
at boot was changed starting in Linux v4.4 to 0 (i.e. check the
|
||||||
|
actual protection), and Android and Linux distributions have been
|
||||||
|
explicitly writing a "0" to /sys/fs/selinux/checkreqprot during
|
||||||
|
initialization for some time. Support for setting checkreqprot to 1
|
||||||
|
will be removed in a future kernel release, at which point the kernel
|
||||||
|
will cease using checkreqprot internally and will always
|
||||||
|
check the actual protections being applied upon mmap/mprotect calls.
|
||||||
|
The checkreqprot selinuxfs node will remain for backward compatibility
|
||||||
|
but will discard writes of the "0" value and will reject writes of the
|
||||||
|
"1" value when this mechanism is removed.
|
|
@ -0,0 +1,9 @@
|
||||||
|
This ABI is moved to /sys/firmware/opal/mpipl/release_core.
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump_release_opalcore
|
||||||
|
Date: Sep 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: write only
|
||||||
|
The sysfs file is available when the system is booted to
|
||||||
|
collect the dump on an OPAL based machine. It is used to release
|
||||||
|
the memory used to collect the opalcore.
|
|
@ -1,5 +1,5 @@
|
||||||
What: /sys/kernel/uids/<uid>/cpu_shares
|
What: /sys/kernel/uids/<uid>/cpu_shares
|
||||||
Date: December 2007
|
Date: December 2007, finally removed in kernel v2.6.34-rc1
|
||||||
Contact: Dhaval Giani <dhaval@linux.vnet.ibm.com>
|
Contact: Dhaval Giani <dhaval@linux.vnet.ibm.com>
|
||||||
Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
|
Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
|
||||||
Description:
|
Description:
|
|
@ -194,11 +194,3 @@ Description:
|
||||||
|
|
||||||
destroy_link write '1' to this attribute to destroy an
|
destroy_link write '1' to this attribute to destroy an
|
||||||
active link
|
active link
|
||||||
|
|
||||||
What: /sys/kernel/config/rdma_cm/<hca>/ports/<port-num>/default_roce_tos
|
|
||||||
Date: March 8, 2019
|
|
||||||
KernelVersion: 5.2
|
|
||||||
Description: RDMA-CM QPs from HCA <hca> at port <port-num>
|
|
||||||
will be created with this TOS as default.
|
|
||||||
This can be overridden by using the rdma_set_option API.
|
|
||||||
The possible RoCE TOS values are 0-255.
|
|
|
@ -43,6 +43,20 @@ Description: Allows the root user to read or write directly through the
|
||||||
If the IOMMU is disabled, it also allows the root user to read
|
If the IOMMU is disabled, it also allows the root user to read
|
||||||
or write from the host a device VA of a host mapped memory
|
or write from the host a device VA of a host mapped memory
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/habanalabs/hl<n>/data64
|
||||||
|
Date: Jan 2020
|
||||||
|
KernelVersion: 5.6
|
||||||
|
Contact: oded.gabbay@gmail.com
|
||||||
|
Description: Allows the root user to read or write 64 bit data directly
|
||||||
|
through the device's PCI bar. Writing to this file generates a
|
||||||
|
write transaction while reading from the file generates a read
|
||||||
|
transaction. This custom interface is needed (instead of using
|
||||||
|
the generic Linux user-space PCI mapping) because the DDR bar
|
||||||
|
is very small compared to the DDR memory and only the driver can
|
||||||
|
move the bar before and after the transaction.
|
||||||
|
If the IOMMU is disabled, it also allows the root user to read
|
||||||
|
or write from the host a device VA of a host mapped memory
|
||||||
|
|
||||||
What: /sys/kernel/debug/habanalabs/hl<n>/device
|
What: /sys/kernel/debug/habanalabs/hl<n>/device
|
||||||
Date: Jan 2019
|
Date: Jan 2019
|
||||||
KernelVersion: 5.1
|
KernelVersion: 5.1
|
||||||
|
|
241
Documentation/ABI/testing/sysfs-bus-coresight-devices-cti
Normal file
241
Documentation/ABI/testing/sysfs-bus-coresight-devices-cti
Normal file
|
@ -0,0 +1,241 @@
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/enable
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Enable/Disable the CTI hardware.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/powered
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Indicate if the CTI hardware is powered.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/ctmid
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Display the associated CTM ID
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/nr_trigger_cons
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Number of devices connected to triggers on this CTI
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/name
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Name of connected device <N>
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_signals
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Input trigger signals from connected device <N>
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_types
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Functional types for the input trigger signals
|
||||||
|
from connected device <N>
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_signals
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Output trigger signals to connected device <N>
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_types
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Functional types for the output trigger signals
|
||||||
|
to connected device <N>
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/inout_sel
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Select the index for inen and outen registers.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/inen
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Read or write the CTIINEN register selected by inout_sel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/outen
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Read or write the CTIOUTEN register selected by inout_sel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/gate
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Read or write CTIGATE register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/asicctl
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Read or write ASICCTL register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/intack
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Write the INTACK register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/appset
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Set CTIAPPSET register to activate channel. Read back to
|
||||||
|
determine current value of register.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/appclear
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Write APPCLEAR register to deactivate channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/apppulse
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Write APPPULSE to pulse a channel active for one clock
|
||||||
|
cycle.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/chinstatus
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read current status of channel inputs.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/choutstatus
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read current status of channel outputs.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/triginstatus
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read current status of input trigger signals.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/regs/trigoutstatus
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read current status of output trigger signals.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_attach
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Attach a CTI input trigger to a CTM channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_detach
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Detach a CTI input trigger from a CTM channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_attach
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Attach a CTI output trigger to a CTM channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_detach
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Detach a CTI output trigger from a CTM channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_enable
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Enable CTIGATE for single channel (W) or list enabled
|
||||||
|
channels through the gate (R).
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_disable
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Disable CTIGATE for single channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_set
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Activate a single channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_clear
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Deactivate a single channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_pulse
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Pulse a single channel - activate for a single clock cycle.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_filtered
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) List of output triggers filtered across all connections.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/trig_filter_enable
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Enable or disable trigger output signal filtering.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_inuse
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Show channels with at least one attached trigger signal.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_free
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Show channels with no attached trigger signals.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_sel
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (RW) Write channel number to select a channel to view, read to
|
||||||
|
see selected channel number.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_in
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read to see input triggers connected to selected view
|
||||||
|
channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_out
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (R) Read to see output triggers connected to selected view
|
||||||
|
channel.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_reset
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Mike Leach or Mathieu Poirier
|
||||||
|
Description: (W) Clear all channel / trigger programming.
|
|
@ -1,3 +1,28 @@
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/cable_fault
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates whether a differential
|
||||||
|
encoder cable fault (not connected or loose wires) is detected
|
||||||
|
for the respective channel of Signal Y. Valid attribute values
|
||||||
|
are boolean. Detection must first be enabled via the
|
||||||
|
corresponding cable_fault_enable attribute.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Whether detection of differential encoder cable faults for the
|
||||||
|
respective channel of Signal Y is enabled. Valid attribute
|
||||||
|
values are boolean.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Filter clock factor for input Signal Y. This prescaler value
|
||||||
|
affects the inputs of both quadrature pair signals.
|
||||||
|
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
||||||
KernelVersion: 5.2
|
KernelVersion: 5.2
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
|
|
@ -2,17 +2,22 @@ What: /sys/bus/iio/devices/iio:deviceX/ac_excitation_en
|
||||||
KernelVersion:
|
KernelVersion:
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Reading gives the state of AC excitation.
|
This attribute, if available, is used to enable the AC
|
||||||
Writing '1' enables AC excitation.
|
excitation mode found on some converters. In AC excitation mode,
|
||||||
|
the polarity of the excitation voltage is reversed on
|
||||||
|
alternate cycles, to eliminate DC errors.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/bridge_switch_en
|
What: /sys/bus/iio/devices/iio:deviceX/bridge_switch_en
|
||||||
KernelVersion:
|
KernelVersion:
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
This bridge switch is used to disconnect it when there is a
|
This attribute, if available, is used to close or open the
|
||||||
need to minimize the system current consumption.
|
bridge power down switch found on some converters.
|
||||||
Reading gives the state of the bridge switch.
|
In bridge applications, such as strain gauges and load cells,
|
||||||
Writing '1' enables the bridge switch.
|
the bridge itself consumes the majority of the current in the
|
||||||
|
system. To minimize the current consumption of the system,
|
||||||
|
the bridge can be disconnected (when it is not being used)
|
||||||
|
using the bridge_switch_en attribute.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration
|
||||||
KernelVersion:
|
KernelVersion:
|
||||||
|
@ -21,6 +26,13 @@ Description:
|
||||||
Initiates the system calibration procedure. This is done on a
|
Initiates the system calibration procedure. This is done on a
|
||||||
single channel at a time. Write '1' to start the calibration.
|
single channel at a time. Write '1' to start the calibration.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltage2-voltage2_shorted_raw
|
||||||
|
KernelVersion:
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Measure voltage from AIN2 pin connected to AIN(+)
|
||||||
|
and AIN(-) shorted.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode_available
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode_available
|
||||||
KernelVersion:
|
KernelVersion:
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
|
|
@ -40,3 +40,11 @@ Description: (RW) Trigger window switch for the MSC's buffer, in
|
||||||
triggering a window switch for the buffer. Returns an error in any
|
triggering a window switch for the buffer. Returns an error in any
|
||||||
other operating mode or attempts to write something other than "1".
|
other operating mode or attempts to write something other than "1".
|
||||||
|
|
||||||
|
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/stop_on_full
|
||||||
|
Date: March 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||||
|
Description: (RW) Configure whether trace stops when the last available window
|
||||||
|
becomes full (1/y/Y) or wraps around and continues until the next
|
||||||
|
window becomes available again (0/n/N).
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||||
Description:
|
Description:
|
||||||
Provides information about the interface type and the physical
|
Provides information about the interface type and the physical
|
||||||
location of the device. Hardware attached via USB, for instance,
|
location of the device. Hardware attached via USB, for instance,
|
||||||
might return <usb_device 1-1.1:1.0>
|
might return <1-1.1:1.0>
|
||||||
Users:
|
Users:
|
||||||
|
|
||||||
What: /sys/bus/most/devices/.../interface
|
What: /sys/bus/most/devices/.../interface
|
||||||
|
@ -278,25 +278,7 @@ Description:
|
||||||
Indicates whether current channel ran out of buffers.
|
Indicates whether current channel ran out of buffers.
|
||||||
Users:
|
Users:
|
||||||
|
|
||||||
What: /sys/bus/most/drivers/mostcore/add_link
|
What: /sys/bus/most/drivers/most_core/components
|
||||||
Date: March 2017
|
|
||||||
KernelVersion: 4.15
|
|
||||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
|
||||||
Description:
|
|
||||||
This is used to link a channel to a component of the
|
|
||||||
mostcore. A link created by writing to this file is
|
|
||||||
referred to as pipe.
|
|
||||||
Users:
|
|
||||||
|
|
||||||
What: /sys/bus/most/drivers/mostcore/remove_link
|
|
||||||
Date: March 2017
|
|
||||||
KernelVersion: 4.15
|
|
||||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
|
||||||
Description:
|
|
||||||
This is used to unlink a channel from a component.
|
|
||||||
Users:
|
|
||||||
|
|
||||||
What: /sys/bus/most/drivers/mostcore/components
|
|
||||||
Date: March 2017
|
Date: March 2017
|
||||||
KernelVersion: 4.15
|
KernelVersion: 4.15
|
||||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
Contact: Christian Gromm <christian.gromm@microchip.com>
|
||||||
|
@ -304,7 +286,7 @@ Description:
|
||||||
This is used to retrieve a list of registered components.
|
This is used to retrieve a list of registered components.
|
||||||
Users:
|
Users:
|
||||||
|
|
||||||
What: /sys/bus/most/drivers/mostcore/links
|
What: /sys/bus/most/drivers/most_core/links
|
||||||
Date: March 2017
|
Date: March 2017
|
||||||
KernelVersion: 4.15
|
KernelVersion: 4.15
|
||||||
Contact: Christian Gromm <christian.gromm@microchip.com>
|
Contact: Christian Gromm <christian.gromm@microchip.com>
|
|
@ -20,13 +20,13 @@ Date: April 2017
|
||||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
Description:
|
Description:
|
||||||
The supported power roles. This attribute can be used to request
|
The supported power roles. This attribute can be used to request
|
||||||
power role swap on the port when the port supports USB Power
|
power role swap on the port. Swapping is supported as
|
||||||
Delivery. Swapping is supported as synchronous operation, so
|
synchronous operation, so write(2) to the attribute will not
|
||||||
write(2) to the attribute will not return until the operation
|
return until the operation has finished. The attribute is
|
||||||
has finished. The attribute is notified about role changes so
|
notified about role changes so that poll(2) on the attribute
|
||||||
that poll(2) on the attribute wakes up. Change on the role will
|
wakes up. Change on the role will also generate uevent
|
||||||
also generate uevent KOBJ_CHANGE. The current role is show in
|
KOBJ_CHANGE. The current role is shown in brackets, for example
|
||||||
brackets, for example "[source] sink" when in source mode.
|
"[source] sink" when in source mode.
|
||||||
|
|
||||||
Valid values: source, sink
|
Valid values: source, sink
|
||||||
|
|
||||||
|
@ -108,6 +108,15 @@ Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||||
Description:
|
Description:
|
||||||
Revision number of the supported USB Type-C specification.
|
Revision number of the supported USB Type-C specification.
|
||||||
|
|
||||||
|
What: /sys/class/typec/<port>/orientation
|
||||||
|
Date: February 2020
|
||||||
|
Contact: Badhri Jagan Sridharan <badhri@google.com>
|
||||||
|
Description:
|
||||||
|
Indicates the active orientation of the Type-C connector.
|
||||||
|
Valid values:
|
||||||
|
- "normal": CC1 orientation
|
||||||
|
- "reverse": CC2 orientation
|
||||||
|
- "unknown": Orientation cannot be determined.
|
||||||
|
|
||||||
USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
|
USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
|
||||||
|
|
||||||
|
|
16
Documentation/ABI/testing/sysfs-driver-jz4780-efuse
Normal file
16
Documentation/ABI/testing/sysfs-driver-jz4780-efuse
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
What: /sys/devices/*/<our-device>/nvmem
|
||||||
|
Date: December 2017
|
||||||
|
Contact: PrasannaKumar Muralidharan <prasannatsmkumar@gmail.com>
|
||||||
|
Description: read-only access to the efuse on the Ingenic JZ4780 SoC
|
||||||
|
The SoC has a one time programmable 8K efuse that is
|
||||||
|
split into segments. The driver supports read only.
|
||||||
|
The segments are
|
||||||
|
0x000 64 bit Random Number
|
||||||
|
0x008 128 bit Ingenic Chip ID
|
||||||
|
0x018 128 bit Customer ID
|
||||||
|
0x028 3520 bit Reserved
|
||||||
|
0x1E0 8 bit Protect Segment
|
||||||
|
0x1E1 2296 bit HDMI Key
|
||||||
|
0x300 2048 bit Security boot key
|
||||||
|
Users: any user space application which wants to read the Chip
|
||||||
|
and Customer ID
|
39
Documentation/ABI/testing/sysfs-driver-uacce
Normal file
39
Documentation/ABI/testing/sysfs-driver-uacce
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
What: /sys/class/uacce/<dev_name>/api
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Api of the device
|
||||||
|
Can be any string and up to userspace to parse.
|
||||||
|
Applications use the api to match the correct driver
|
||||||
|
|
||||||
|
What: /sys/class/uacce/<dev_name>/flags
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h
|
||||||
|
|
||||||
|
What: /sys/class/uacce/<dev_name>/available_instances
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Available instances left of the device
|
||||||
|
Return -ENODEV if uacce_ops get_available_instances is not provided
|
||||||
|
|
||||||
|
What: /sys/class/uacce/<dev_name>/algorithms
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Algorithms supported by this accelerator, separated by new line.
|
||||||
|
Can be any string and up to userspace to parse.
|
||||||
|
|
||||||
|
What: /sys/class/uacce/<dev_name>/region_mmio_size
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Size (bytes) of mmio region queue file
|
||||||
|
|
||||||
|
What: /sys/class/uacce/<dev_name>/region_dus_size
|
||||||
|
Date: Feb 2020
|
||||||
|
KernelVersion: 5.7
|
||||||
|
Contact: linux-accelerators@lists.ozlabs.org
|
||||||
|
Description: Size (bytes) of dus region queue file
|
21
Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
Normal file
21
Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
What: /sys/firmware/opal/sensor_groups
|
||||||
|
Date: August 2017
|
||||||
|
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||||
|
Description: Sensor groups directory for POWER9 powernv servers
|
||||||
|
|
||||||
|
Each folder in this directory contains a sensor group
|
||||||
|
which is classified based on type of the sensor
|
||||||
|
like power, temperature, frequency, current, etc. They
|
||||||
|
can also indicate the group of sensors belonging to
|
||||||
|
different owners like CSM, Profiler, Job-Scheduler
|
||||||
|
|
||||||
|
What: /sys/firmware/opal/sensor_groups/<sensor_group_name>/clear
|
||||||
|
Date: August 2017
|
||||||
|
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||||
|
Description: Sysfs file to clear the min-max of all the sensors
|
||||||
|
belonging to the group.
|
||||||
|
|
||||||
|
Writing 1 to this file will clear the minimum and
|
||||||
|
maximum values of all the sensors in the group.
|
||||||
|
In POWER9, the min-max of a sensor is the historical minimum
|
||||||
|
and maximum value of the sensor cached by OCC.
|
|
@ -318,3 +318,8 @@ Date: September 2019
|
||||||
Contact: "Hridya Valsaraju" <hridya@google.com>
|
Contact: "Hridya Valsaraju" <hridya@google.com>
|
||||||
Description: Average number of valid blocks.
|
Description: Average number of valid blocks.
|
||||||
Available when CONFIG_F2FS_STAT_FS=y.
|
Available when CONFIG_F2FS_STAT_FS=y.
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/mounted_time_sec
|
||||||
|
Date: February 2020
|
||||||
|
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||||
|
Description: Show the mounted time in secs of this partition.
|
||||||
|
|
40
Documentation/ABI/testing/sysfs-kernel-fadump
Normal file
40
Documentation/ABI/testing/sysfs-kernel-fadump
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
What: /sys/kernel/fadump/*
|
||||||
|
Date: Dec 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description:
|
||||||
|
The /sys/kernel/fadump/* is a collection of FADump sysfs
|
||||||
|
files that provide information about the configuration status
|
||||||
|
of Firmware Assisted Dump (FADump).
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump/enabled
|
||||||
|
Date: Dec 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: read only
|
||||||
|
Primarily used to identify whether the FADump is enabled in
|
||||||
|
the kernel or not.
|
||||||
|
User: Kdump service
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump/registered
|
||||||
|
Date: Dec 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: read/write
|
||||||
|
Helps to control the dump collect feature from userspace.
|
||||||
|
Setting 1 to this file enables the system to collect the
|
||||||
|
dump and 0 to disable it.
|
||||||
|
User: Kdump service
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump/release_mem
|
||||||
|
Date: Dec 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: write only
|
||||||
|
This is a special sysfs file and only available when
|
||||||
|
the system is booted to capture the vmcore using FADump.
|
||||||
|
It is used to release the memory reserved by FADump to
|
||||||
|
save the crash dump.
|
||||||
|
|
||||||
|
What: /sys/kernel/fadump/mem_reserved
|
||||||
|
Date: Dec 2019
|
||||||
|
Contact: linuxppc-dev@lists.ozlabs.org
|
||||||
|
Description: read only
|
||||||
|
Provide information about the amount of memory reserved by
|
||||||
|
FADump to save the crash dump in bytes.
|
|
@ -2,7 +2,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_enabled
|
||||||
Date: December 2014
|
Date: December 2014
|
||||||
KernelVersion: 3.19
|
KernelVersion: 3.19
|
||||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||||
Pali Rohár <pali.rohar@gmail.com>
|
Pali Rohár <pali@kernel.org>
|
||||||
Description:
|
Description:
|
||||||
This file allows to control the automatic keyboard
|
This file allows to control the automatic keyboard
|
||||||
illumination mode on some systems that have an ambient
|
illumination mode on some systems that have an ambient
|
||||||
|
@ -13,7 +13,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_setting
|
||||||
Date: December 2014
|
Date: December 2014
|
||||||
KernelVersion: 3.19
|
KernelVersion: 3.19
|
||||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||||
Pali Rohár <pali.rohar@gmail.com>
|
Pali Rohár <pali@kernel.org>
|
||||||
Description:
|
Description:
|
||||||
This file allows to specify the on/off threshold value,
|
This file allows to specify the on/off threshold value,
|
||||||
as reported by the ambient light sensor.
|
as reported by the ambient light sensor.
|
||||||
|
@ -22,7 +22,7 @@ What: /sys/class/leds/dell::kbd_backlight/start_triggers
|
||||||
Date: December 2014
|
Date: December 2014
|
||||||
KernelVersion: 3.19
|
KernelVersion: 3.19
|
||||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||||
Pali Rohár <pali.rohar@gmail.com>
|
Pali Rohár <pali@kernel.org>
|
||||||
Description:
|
Description:
|
||||||
This file allows to control the input triggers that
|
This file allows to control the input triggers that
|
||||||
turn on the keyboard backlight illumination that is
|
turn on the keyboard backlight illumination that is
|
||||||
|
@ -45,7 +45,7 @@ What: /sys/class/leds/dell::kbd_backlight/stop_timeout
|
||||||
Date: December 2014
|
Date: December 2014
|
||||||
KernelVersion: 3.19
|
KernelVersion: 3.19
|
||||||
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
Contact: Gabriele Mazzotta <gabriele.mzt@gmail.com>,
|
||||||
Pali Rohár <pali.rohar@gmail.com>
|
Pali Rohár <pali@kernel.org>
|
||||||
Description:
|
Description:
|
||||||
This file allows to specify the interval after which the
|
This file allows to specify the interval after which the
|
||||||
keyboard illumination is disabled because of inactivity.
|
keyboard illumination is disabled because of inactivity.
|
||||||
|
|
|
@ -154,3 +154,10 @@ Description:
|
||||||
device specification. For example, when user sets 7bytes on
|
device specification. For example, when user sets 7bytes on
|
||||||
16550A, which has 1/4/8/14 bytes trigger, the RX trigger is
|
16550A, which has 1/4/8/14 bytes trigger, the RX trigger is
|
||||||
automatically changed to 4 bytes.
|
automatically changed to 4 bytes.
|
||||||
|
|
||||||
|
What: /sys/class/tty/ttyS0/console
|
||||||
|
Date: February 2020
|
||||||
|
Contact: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
|
||||||
|
Description:
|
||||||
|
Allows user to detach or attach back the given device as
|
||||||
|
kernel console. It shows and accepts a boolean variable.
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
# Makefile for Sphinx documentation
|
# Makefile for Sphinx documentation
|
||||||
#
|
#
|
||||||
|
|
||||||
subdir-y := devicetree/bindings/
|
# for cleaning
|
||||||
|
subdir- := devicetree/bindings
|
||||||
|
|
||||||
# Check for broken documentation file references
|
# Check for broken documentation file references
|
||||||
ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y)
|
ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y)
|
||||||
|
@ -13,7 +14,7 @@ endif
|
||||||
SPHINXBUILD = sphinx-build
|
SPHINXBUILD = sphinx-build
|
||||||
SPHINXOPTS =
|
SPHINXOPTS =
|
||||||
SPHINXDIRS = .
|
SPHINXDIRS = .
|
||||||
_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst))
|
_SPHINXDIRS = $(sort $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst)))
|
||||||
SPHINX_CONF = conf.py
|
SPHINX_CONF = conf.py
|
||||||
PAPER =
|
PAPER =
|
||||||
BUILDDIR = $(obj)/output
|
BUILDDIR = $(obj)/output
|
||||||
|
|
155
Documentation/PCI/boot-interrupts.rst
Normal file
155
Documentation/PCI/boot-interrupts.rst
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===============
|
||||||
|
Boot Interrupts
|
||||||
|
===============
|
||||||
|
|
||||||
|
:Author: - Sean V Kelley <sean.v.kelley@linux.intel.com>
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
On PCI Express, interrupts are represented with either MSI or inbound
|
||||||
|
interrupt messages (Assert_INTx/Deassert_INTx). The integrated IO-APIC in a
|
||||||
|
given Core IO converts the legacy interrupt messages from PCI Express to
|
||||||
|
MSI interrupts. If the IO-APIC is disabled (via the mask bits in the
|
||||||
|
IO-APIC table entries), the messages are routed to the legacy PCH. This
|
||||||
|
in-band interrupt mechanism was traditionally necessary for systems that
|
||||||
|
did not support the IO-APIC and for boot. Intel in the past has used the
|
||||||
|
term "boot interrupts" to describe this mechanism. Further, the PCI Express
|
||||||
|
protocol describes this in-band legacy wire-interrupt INTx mechanism for
|
||||||
|
I/O devices to signal PCI-style level interrupts. The subsequent paragraphs
|
||||||
|
describe problems with the Core IO handling of INTx message routing to the
|
||||||
|
PCH and mitigation within BIOS and the OS.
|
||||||
|
|
||||||
|
|
||||||
|
Issue
|
||||||
|
=====
|
||||||
|
|
||||||
|
When in-band legacy INTx messages are forwarded to the PCH, they in turn
|
||||||
|
trigger a new interrupt for which the OS likely lacks a handler. When an
|
||||||
|
interrupt goes unhandled over time, they are tracked by the Linux kernel as
|
||||||
|
Spurious Interrupts. The IRQ will be disabled by the Linux kernel after it
|
||||||
|
reaches a specific count with the error "nobody cared". This disabled IRQ
|
||||||
|
now prevents valid usage by an existing interrupt which may happen to share
|
||||||
|
the IRQ line.
|
||||||
|
|
||||||
|
irq 19: nobody cared (try booting with the "irqpoll" option)
|
||||||
|
CPU: 0 PID: 2988 Comm: irq/34-nipalk Tainted: 4.14.87-rt49-02410-g4a640ec-dirty #1
|
||||||
|
Hardware name: National Instruments NI PXIe-8880/NI PXIe-8880, BIOS 2.1.5f1 01/09/2020
|
||||||
|
Call Trace:
|
||||||
|
<IRQ>
|
||||||
|
? dump_stack+0x46/0x5e
|
||||||
|
? __report_bad_irq+0x2e/0xb0
|
||||||
|
? note_interrupt+0x242/0x290
|
||||||
|
? nNIKAL100_memoryRead16+0x8/0x10 [nikal]
|
||||||
|
? handle_irq_event_percpu+0x55/0x70
|
||||||
|
? handle_irq_event+0x4f/0x80
|
||||||
|
? handle_fasteoi_irq+0x81/0x180
|
||||||
|
? handle_irq+0x1c/0x30
|
||||||
|
? do_IRQ+0x41/0xd0
|
||||||
|
? common_interrupt+0x84/0x84
|
||||||
|
</IRQ>
|
||||||
|
|
||||||
|
handlers:
|
||||||
|
irq_default_primary_handler threaded usb_hcd_irq
|
||||||
|
Disabling IRQ #19
|
||||||
|
|
||||||
|
|
||||||
|
Conditions
|
||||||
|
==========
|
||||||
|
|
||||||
|
The use of threaded interrupts is the most likely condition to trigger
|
||||||
|
this problem today. Threaded interrupts may not be reenabled after the IRQ
|
||||||
|
handler wakes. These "one shot" conditions mean that the threaded interrupt
|
||||||
|
needs to keep the interrupt line masked until the threaded handler has run.
|
||||||
|
Especially when dealing with high data rate interrupts, the thread needs to
|
||||||
|
run to completion; otherwise some handlers will end up in stack overflows
|
||||||
|
since the interrupt of the issuing device is still active.
|
||||||
|
|
||||||
|
Affected Chipsets
|
||||||
|
=================
|
||||||
|
|
||||||
|
The legacy interrupt forwarding mechanism exists today in a number of
|
||||||
|
devices including but not limited to chipsets from AMD/ATI, Broadcom, and
|
||||||
|
Intel. Changes made through the mitigations below have been applied to
|
||||||
|
drivers/pci/quirks.c
|
||||||
|
|
||||||
|
Starting with ICX there are no longer any IO-APICs in the Core IO's
|
||||||
|
devices. IO-APIC is only in the PCH. Devices connected to the Core IO's
|
||||||
|
PCIe Root Ports will use native MSI/MSI-X mechanisms.
|
||||||
|
|
||||||
|
Mitigations
|
||||||
|
===========
|
||||||
|
|
||||||
|
The mitigations take the form of PCI quirks. The preference has been to
|
||||||
|
first identify and make use of a means to disable the routing to the PCH.
|
||||||
|
In such a case a quirk to disable boot interrupt generation can be
|
||||||
|
added.[1]
|
||||||
|
|
||||||
|
Intel® 6300ESB I/O Controller Hub
|
||||||
|
Alternate Base Address Register:
|
||||||
|
BIE: Boot Interrupt Enable
|
||||||
|
0 = Boot interrupt is enabled.
|
||||||
|
1 = Boot interrupt is disabled.
|
||||||
|
|
||||||
|
Intel® Sandy Bridge through Sky Lake based Xeon servers:
|
||||||
|
Coherent Interface Protocol Interrupt Control
|
||||||
|
dis_intx_route2pch/dis_intx_route2ich/dis_intx_route2dmi2:
|
||||||
|
When this bit is set, local INTx messages received from the
|
||||||
|
Intel® Quick Data DMA/PCI Express ports are not routed to legacy
|
||||||
|
PCH - they are either converted into MSI via the integrated IO-APIC
|
||||||
|
(if the IO-APIC mask bit is clear in the appropriate entries)
|
||||||
|
or cause no further action (when mask bit is set)
|
||||||
|
|
||||||
|
In the absence of a way to directly disable the routing, another approach
|
||||||
|
has been to make use of PCI Interrupt pin to INTx routing tables for
|
||||||
|
purposes of redirecting the interrupt handler to the rerouted interrupt
|
||||||
|
line by default. Therefore, on chipsets where this INTx routing cannot be
|
||||||
|
disabled, the Linux kernel will reroute the valid interrupt to its legacy
|
||||||
|
interrupt. This redirection of the handler will prevent the occurrence of
|
||||||
|
the spurious interrupt detection which would ordinarily disable the IRQ
|
||||||
|
line due to excessive unhandled counts.[2]
|
||||||
|
|
||||||
|
The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS exists to enable (or
|
||||||
|
disable) the redirection of the interrupt handler to the PCH interrupt
|
||||||
|
line. The option can be overridden by either pci=ioapicreroute or
|
||||||
|
pci=noioapicreroute.[3]
|
||||||
|
|
||||||
|
|
||||||
|
More Documentation
|
||||||
|
==================
|
||||||
|
|
||||||
|
There is an overview of the legacy interrupt handling in several datasheets
|
||||||
|
(6300ESB and 6700PXH below). While largely the same, it provides insight
|
||||||
|
into the evolution of its handling with chipsets.
|
||||||
|
|
||||||
|
Example of disabling of the boot interrupt
|
||||||
|
------------------------------------------
|
||||||
|
|
||||||
|
Intel® 6300ESB I/O Controller Hub (Document # 300641-004US)
|
||||||
|
5.7.3 Boot Interrupt
|
||||||
|
https://www.intel.com/content/dam/doc/datasheet/6300esb-io-controller-hub-datasheet.pdf
|
||||||
|
|
||||||
|
Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families
|
||||||
|
Datasheet - Volume 2: Registers (Document # 330784-003)
|
||||||
|
6.6.41 cipintrc Coherent Interface Protocol Interrupt Control
|
||||||
|
https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf
|
||||||
|
|
||||||
|
Example of handler rerouting
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
Intel® 6700PXH 64-bit PCI Hub (Document # 302628)
|
||||||
|
2.15.2 PCI Express Legacy INTx Support and Boot Interrupt
|
||||||
|
https://www.intel.com/content/dam/doc/datasheet/6700pxh-64-bit-pci-hub-datasheet.pdf
|
||||||
|
|
||||||
|
|
||||||
|
If you have any legacy PCI interrupt questions that aren't answered, email me.
|
||||||
|
|
||||||
|
Cheers,
|
||||||
|
Sean V Kelley
|
||||||
|
sean.v.kelley@linux.intel.com
|
||||||
|
|
||||||
|
[1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/
|
||||||
|
[2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/
|
||||||
|
[3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/
|
|
@ -16,3 +16,4 @@ Linux PCI Bus Subsystem
|
||||||
pci-error-recovery
|
pci-error-recovery
|
||||||
pcieaer-howto
|
pcieaer-howto
|
||||||
endpoint/index
|
endpoint/index
|
||||||
|
boot-interrupts
|
||||||
|
|
|
@ -239,7 +239,7 @@ from the PCI device config space. Use the values in the pci_dev structure
|
||||||
as the PCI "bus address" might have been remapped to a "host physical"
|
as the PCI "bus address" might have been remapped to a "host physical"
|
||||||
address by the arch/chip-set specific kernel support.
|
address by the arch/chip-set specific kernel support.
|
||||||
|
|
||||||
See Documentation/io-mapping.txt for how to access device registers
|
See Documentation/driver-api/io-mapping.rst for how to access device registers
|
||||||
or device memory.
|
or device memory.
|
||||||
|
|
||||||
The device driver needs to call pci_request_region() to verify
|
The device driver needs to call pci_request_region() to verify
|
||||||
|
|
|
@ -156,12 +156,6 @@ default reset_link function, but different upstream ports might
|
||||||
have different specifications to reset pci express link, so all
|
have different specifications to reset pci express link, so all
|
||||||
upstream ports should provide their own reset_link functions.
|
upstream ports should provide their own reset_link functions.
|
||||||
|
|
||||||
In struct pcie_port_service_driver, a new pointer, reset_link, is
|
|
||||||
added.
|
|
||||||
::
|
|
||||||
|
|
||||||
pci_ers_result_t (*reset_link) (struct pci_dev *dev);
|
|
||||||
|
|
||||||
Section 3.2.2.2 provides more detailed info on when to call
|
Section 3.2.2.2 provides more detailed info on when to call
|
||||||
reset_link.
|
reset_link.
|
||||||
|
|
||||||
|
@ -212,15 +206,10 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within
|
||||||
a hierarchy in question. Then, performing link reset at upstream is
|
a hierarchy in question. Then, performing link reset at upstream is
|
||||||
necessary. As different kinds of devices might use different approaches
|
necessary. As different kinds of devices might use different approaches
|
||||||
to reset link, AER port service driver is required to provide the
|
to reset link, AER port service driver is required to provide the
|
||||||
function to reset link. Firstly, kernel looks for if the upstream
|
function to reset link via callback parameter of pcie_do_recovery()
|
||||||
component has an aer driver. If it has, kernel uses the reset_link
|
function. If reset_link is not NULL, recovery function will use it
|
||||||
callback of the aer driver. If the upstream component has no aer driver
|
to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER
|
||||||
and the port is downstream port, we will perform a hot reset as the
|
and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
|
||||||
default by setting the Secondary Bus Reset bit of the Bridge Control
|
|
||||||
register associated with the downstream port. As for upstream ports,
|
|
||||||
they should provide their own aer service drivers with reset_link
|
|
||||||
function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
|
|
||||||
reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
|
|
||||||
to mmio_enabled.
|
to mmio_enabled.
|
||||||
|
|
||||||
helper functions
|
helper functions
|
||||||
|
@ -243,9 +232,9 @@ messages to root port when an error is detected.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);`
|
int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
|
||||||
|
|
||||||
pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
|
pci_aer_clear_nonfatal_status clears non-fatal errors in the uncorrectable
|
||||||
error status register.
|
error status register.
|
||||||
|
|
||||||
Frequent Asked Questions
|
Frequent Asked Questions
|
||||||
|
|
|
@ -4,7 +4,7 @@ A Tour Through TREE_RCU's Grace-Period Memory Ordering
|
||||||
|
|
||||||
August 8, 2017
|
August 8, 2017
|
||||||
|
|
||||||
This article was contributed by Paul E. McKenney
|
This article was contributed by Paul E. McKenney
|
||||||
|
|
||||||
Introduction
|
Introduction
|
||||||
============
|
============
|
||||||
|
@ -48,7 +48,7 @@ Tree RCU Grace Period Memory Ordering Building Blocks
|
||||||
|
|
||||||
The workhorse for RCU's grace-period memory ordering is the
|
The workhorse for RCU's grace-period memory ordering is the
|
||||||
critical section for the ``rcu_node`` structure's
|
critical section for the ``rcu_node`` structure's
|
||||||
``->lock``. These critical sections use helper functions for lock
|
``->lock``. These critical sections use helper functions for lock
|
||||||
acquisition, including ``raw_spin_lock_rcu_node()``,
|
acquisition, including ``raw_spin_lock_rcu_node()``,
|
||||||
``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``.
|
``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``.
|
||||||
Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``,
|
Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``,
|
||||||
|
@ -102,9 +102,9 @@ lock-acquisition and lock-release functions::
|
||||||
23 r3 = READ_ONCE(x);
|
23 r3 = READ_ONCE(x);
|
||||||
24 }
|
24 }
|
||||||
25
|
25
|
||||||
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0);
|
||||||
|
|
||||||
The ``WARN_ON()`` is evaluated at “the end of time”,
|
The ``WARN_ON()`` is evaluated at "the end of time",
|
||||||
after all changes have propagated throughout the system.
|
after all changes have propagated throughout the system.
|
||||||
Without the ``smp_mb__after_unlock_lock()`` provided by the
|
Without the ``smp_mb__after_unlock_lock()`` provided by the
|
||||||
acquisition functions, this ``WARN_ON()`` could trigger, for example
|
acquisition functions, this ``WARN_ON()`` could trigger, for example
|
||||||
|
|
|
@ -4,12 +4,61 @@ Using RCU to Protect Read-Mostly Linked Lists
|
||||||
=============================================
|
=============================================
|
||||||
|
|
||||||
One of the best applications of RCU is to protect read-mostly linked lists
|
One of the best applications of RCU is to protect read-mostly linked lists
|
||||||
("struct list_head" in list.h). One big advantage of this approach
|
(``struct list_head`` in list.h). One big advantage of this approach
|
||||||
is that all of the required memory barriers are included for you in
|
is that all of the required memory barriers are included for you in
|
||||||
the list macros. This document describes several applications of RCU,
|
the list macros. This document describes several applications of RCU,
|
||||||
with the best fits first.
|
with the best fits first.
|
||||||
|
|
||||||
Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates
|
|
||||||
|
Example 1: Read-mostly list: Deferred Destruction
|
||||||
|
-------------------------------------------------
|
||||||
|
|
||||||
|
A widely used use case for RCU lists in the kernel is lockless iteration over
|
||||||
|
all processes in the system. ``task_struct::tasks`` represents the list node that
|
||||||
|
links all the processes. The list can be traversed in parallel to any list
|
||||||
|
additions or removals.
|
||||||
|
|
||||||
|
The traversal of the list is done using ``for_each_process()`` which is defined
|
||||||
|
by the 2 macros::
|
||||||
|
|
||||||
|
#define next_task(p) \
|
||||||
|
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
|
||||||
|
|
||||||
|
#define for_each_process(p) \
|
||||||
|
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
|
||||||
|
|
||||||
|
The code traversing the list of all processes typically looks like::
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
for_each_process(p) {
|
||||||
|
/* Do something with p */
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
The simplified code for removing a process from a task list is::
|
||||||
|
|
||||||
|
void release_task(struct task_struct *p)
|
||||||
|
{
|
||||||
|
write_lock(&tasklist_lock);
|
||||||
|
list_del_rcu(&p->tasks);
|
||||||
|
write_unlock(&tasklist_lock);
|
||||||
|
call_rcu(&p->rcu, delayed_put_task_struct);
|
||||||
|
}
|
||||||
|
|
||||||
|
When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under
|
||||||
|
``tasklist_lock`` writer lock protection, to remove the task from the list of
|
||||||
|
all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals
|
||||||
|
from corrupting the list. Readers using ``for_each_process()`` are not protected
|
||||||
|
with the ``tasklist_lock``. To prevent readers from noticing changes in the list
|
||||||
|
pointers, the ``task_struct`` object is freed only after one or more grace
|
||||||
|
periods elapse (with the help of call_rcu()). This deferring of destruction
|
||||||
|
ensures that any readers traversing the list will see valid ``p->tasks.next``
|
||||||
|
pointers and deletion/freeing can happen in parallel with traversal of the list.
|
||||||
|
This pattern is also called an **existence lock**, since RCU pins the object in
|
||||||
|
memory until all existing readers finish.
|
||||||
|
|
||||||
|
|
||||||
|
Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates
|
||||||
----------------------------------------------------------------------
|
----------------------------------------------------------------------
|
||||||
|
|
||||||
The best applications are cases where, if reader-writer locking were
|
The best applications are cases where, if reader-writer locking were
|
||||||
|
@ -26,7 +75,7 @@ added or deleted, rather than being modified in place.
|
||||||
|
|
||||||
A straightforward example of this use of RCU may be found in the
|
A straightforward example of this use of RCU may be found in the
|
||||||
system-call auditing support. For example, a reader-writer locked
|
system-call auditing support. For example, a reader-writer locked
|
||||||
implementation of audit_filter_task() might be as follows::
|
implementation of ``audit_filter_task()`` might be as follows::
|
||||||
|
|
||||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||||
{
|
{
|
||||||
|
@ -34,7 +83,7 @@ implementation of audit_filter_task() might be as follows::
|
||||||
enum audit_state state;
|
enum audit_state state;
|
||||||
|
|
||||||
read_lock(&auditsc_lock);
|
read_lock(&auditsc_lock);
|
||||||
/* Note: audit_netlink_sem held by caller. */
|
/* Note: audit_filter_mutex held by caller. */
|
||||||
list_for_each_entry(e, &audit_tsklist, list) {
|
list_for_each_entry(e, &audit_tsklist, list) {
|
||||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||||
read_unlock(&auditsc_lock);
|
read_unlock(&auditsc_lock);
|
||||||
|
@ -58,7 +107,7 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||||
enum audit_state state;
|
enum audit_state state;
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
/* Note: audit_netlink_sem held by caller. */
|
/* Note: audit_filter_mutex held by caller. */
|
||||||
list_for_each_entry_rcu(e, &audit_tsklist, list) {
|
list_for_each_entry_rcu(e, &audit_tsklist, list) {
|
||||||
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
|
@ -69,18 +118,18 @@ This means that RCU can be easily applied to the read side, as follows::
|
||||||
return AUDIT_BUILD_CONTEXT;
|
return AUDIT_BUILD_CONTEXT;
|
||||||
}
|
}
|
||||||
|
|
||||||
The read_lock() and read_unlock() calls have become rcu_read_lock()
|
The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock()
|
||||||
and rcu_read_unlock(), respectively, and the list_for_each_entry() has
|
and rcu_read_unlock(), respectively, and the list_for_each_entry() has
|
||||||
become list_for_each_entry_rcu(). The _rcu() list-traversal primitives
|
become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives
|
||||||
insert the read-side memory barriers that are required on DEC Alpha CPUs.
|
insert the read-side memory barriers that are required on DEC Alpha CPUs.
|
||||||
|
|
||||||
The changes to the update side are also straightforward. A reader-writer
|
The changes to the update side are also straightforward. A reader-writer lock
|
||||||
lock might be used as follows for deletion and insertion::
|
might be used as follows for deletion and insertion::
|
||||||
|
|
||||||
static inline int audit_del_rule(struct audit_rule *rule,
|
static inline int audit_del_rule(struct audit_rule *rule,
|
||||||
struct list_head *list)
|
struct list_head *list)
|
||||||
{
|
{
|
||||||
struct audit_entry *e;
|
struct audit_entry *e;
|
||||||
|
|
||||||
write_lock(&auditsc_lock);
|
write_lock(&auditsc_lock);
|
||||||
list_for_each_entry(e, list, list) {
|
list_for_each_entry(e, list, list) {
|
||||||
|
@ -113,9 +162,9 @@ Following are the RCU equivalents for these two functions::
|
||||||
static inline int audit_del_rule(struct audit_rule *rule,
|
static inline int audit_del_rule(struct audit_rule *rule,
|
||||||
struct list_head *list)
|
struct list_head *list)
|
||||||
{
|
{
|
||||||
struct audit_entry *e;
|
struct audit_entry *e;
|
||||||
|
|
||||||
/* Do not use the _rcu iterator here, since this is the only
|
/* No need to use the _rcu iterator here, since this is the only
|
||||||
* deletion routine. */
|
* deletion routine. */
|
||||||
list_for_each_entry(e, list, list) {
|
list_for_each_entry(e, list, list) {
|
||||||
if (!audit_compare_rule(rule, &e->rule)) {
|
if (!audit_compare_rule(rule, &e->rule)) {
|
||||||
|
@ -139,45 +188,45 @@ Following are the RCU equivalents for these two functions::
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Normally, the write_lock() and write_unlock() would be replaced by
|
Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a
|
||||||
a spin_lock() and a spin_unlock(), but in this case, all callers hold
|
spin_lock() and a spin_unlock(). But in this case, all callers hold
|
||||||
audit_netlink_sem, so no additional locking is required. The auditsc_lock
|
``audit_filter_mutex``, so no additional locking is required. The
|
||||||
can therefore be eliminated, since use of RCU eliminates the need for
|
``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the
|
||||||
writers to exclude readers. Normally, the write_lock() calls would
|
need for writers to exclude readers.
|
||||||
be converted into spin_lock() calls.
|
|
||||||
|
|
||||||
The list_del(), list_add(), and list_add_tail() primitives have been
|
The list_del(), list_add(), and list_add_tail() primitives have been
|
||||||
replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
|
replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
|
||||||
The _rcu() list-manipulation primitives add memory barriers that are
|
The **_rcu()** list-manipulation primitives add memory barriers that are needed on
|
||||||
needed on weakly ordered CPUs (most of them!). The list_del_rcu()
|
weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the
|
||||||
primitive omits the pointer poisoning debug-assist code that would
|
pointer poisoning debug-assist code that would otherwise cause concurrent
|
||||||
otherwise cause concurrent readers to fail spectacularly.
|
readers to fail spectacularly.
|
||||||
|
|
||||||
So, when readers can tolerate stale data and when entries are either added
|
So, when readers can tolerate stale data and when entries are either added or
|
||||||
or deleted, without in-place modification, it is very easy to use RCU!
|
deleted, without in-place modification, it is very easy to use RCU!
|
||||||
|
|
||||||
Example 2: Handling In-Place Updates
|
|
||||||
|
Example 3: Handling In-Place Updates
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
The system-call auditing code does not update auditing rules in place.
|
The system-call auditing code does not update auditing rules in place. However,
|
||||||
However, if it did, reader-writer-locked code to do so might look as
|
if it did, the reader-writer-locked code to do so might look as follows
|
||||||
follows (presumably, the field_count is only permitted to decrease,
|
(assuming only ``field_count`` is updated, otherwise, the added fields would
|
||||||
otherwise, the added fields would need to be filled in)::
|
need to be filled in)::
|
||||||
|
|
||||||
static inline int audit_upd_rule(struct audit_rule *rule,
|
static inline int audit_upd_rule(struct audit_rule *rule,
|
||||||
struct list_head *list,
|
struct list_head *list,
|
||||||
__u32 newaction,
|
__u32 newaction,
|
||||||
__u32 newfield_count)
|
__u32 newfield_count)
|
||||||
{
|
{
|
||||||
struct audit_entry *e;
|
struct audit_entry *e;
|
||||||
struct audit_newentry *ne;
|
struct audit_entry *ne;
|
||||||
|
|
||||||
write_lock(&auditsc_lock);
|
write_lock(&auditsc_lock);
|
||||||
/* Note: audit_netlink_sem held by caller. */
|
/* Note: audit_filter_mutex held by caller. */
|
||||||
list_for_each_entry(e, list, list) {
|
list_for_each_entry(e, list, list) {
|
||||||
if (!audit_compare_rule(rule, &e->rule)) {
|
if (!audit_compare_rule(rule, &e->rule)) {
|
||||||
e->rule.action = newaction;
|
e->rule.action = newaction;
|
||||||
e->rule.file_count = newfield_count;
|
e->rule.field_count = newfield_count;
|
||||||
write_unlock(&auditsc_lock);
|
write_unlock(&auditsc_lock);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -188,16 +237,16 @@ otherwise, the added fields would need to be filled in)::
|
||||||
|
|
||||||
The RCU version creates a copy, updates the copy, then replaces the old
|
The RCU version creates a copy, updates the copy, then replaces the old
|
||||||
entry with the newly updated entry. This sequence of actions, allowing
|
entry with the newly updated entry. This sequence of actions, allowing
|
||||||
concurrent reads while doing a copy to perform an update, is what gives
|
concurrent reads while making a copy to perform an update, is what gives
|
||||||
RCU ("read-copy update") its name. The RCU code is as follows::
|
RCU (*read-copy update*) its name. The RCU code is as follows::
|
||||||
|
|
||||||
static inline int audit_upd_rule(struct audit_rule *rule,
|
static inline int audit_upd_rule(struct audit_rule *rule,
|
||||||
struct list_head *list,
|
struct list_head *list,
|
||||||
__u32 newaction,
|
__u32 newaction,
|
||||||
__u32 newfield_count)
|
__u32 newfield_count)
|
||||||
{
|
{
|
||||||
struct audit_entry *e;
|
struct audit_entry *e;
|
||||||
struct audit_newentry *ne;
|
struct audit_entry *ne;
|
||||||
|
|
||||||
list_for_each_entry(e, list, list) {
|
list_for_each_entry(e, list, list) {
|
||||||
if (!audit_compare_rule(rule, &e->rule)) {
|
if (!audit_compare_rule(rule, &e->rule)) {
|
||||||
|
@ -206,7 +255,7 @@ RCU ("read-copy update") its name. The RCU code is as follows::
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
audit_copy_rule(&ne->rule, &e->rule);
|
audit_copy_rule(&ne->rule, &e->rule);
|
||||||
ne->rule.action = newaction;
|
ne->rule.action = newaction;
|
||||||
ne->rule.file_count = newfield_count;
|
ne->rule.field_count = newfield_count;
|
||||||
list_replace_rcu(&e->list, &ne->list);
|
list_replace_rcu(&e->list, &ne->list);
|
||||||
call_rcu(&e->rcu, audit_free_rule);
|
call_rcu(&e->rcu, audit_free_rule);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -215,34 +264,45 @@ RCU ("read-copy update") its name. The RCU code is as follows::
|
||||||
return -EFAULT; /* No matching rule */
|
return -EFAULT; /* No matching rule */
|
||||||
}
|
}
|
||||||
|
|
||||||
Again, this assumes that the caller holds audit_netlink_sem. Normally,
|
Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the
|
||||||
the reader-writer lock would become a spinlock in this sort of code.
|
writer lock would become a spinlock in this sort of code.
|
||||||
|
|
||||||
Example 3: Eliminating Stale Data
|
Another use of this pattern can be found in the openvswitch driver's *connection
|
||||||
|
tracking table* code in ``ct_limit_set()``. The table holds connection tracking
|
||||||
|
entries and has a limit on the maximum entries. There is one such table
|
||||||
|
per-zone and hence one *limit* per zone. The zones are mapped to their limits
|
||||||
|
through a hashtable using an RCU-managed hlist for the hash chains. When a new
|
||||||
|
limit is set, a new limit object is allocated and ``ct_limit_set()`` is called
|
||||||
|
to replace the old limit object with the new one using list_replace_rcu().
|
||||||
|
The old limit object is then freed after a grace period using kfree_rcu().
|
||||||
|
|
||||||
|
|
||||||
|
Example 4: Eliminating Stale Data
|
||||||
---------------------------------
|
---------------------------------
|
||||||
|
|
||||||
The auditing examples above tolerate stale data, as do most algorithms
|
The auditing example above tolerates stale data, as do most algorithms
|
||||||
that are tracking external state. Because there is a delay from the
|
that are tracking external state. Because there is a delay from the
|
||||||
time the external state changes before Linux becomes aware of the change,
|
time the external state changes before Linux becomes aware of the change,
|
||||||
additional RCU-induced staleness is normally not a problem.
|
additional RCU-induced staleness is generally not a problem.
|
||||||
|
|
||||||
However, there are many examples where stale data cannot be tolerated.
|
However, there are many examples where stale data cannot be tolerated.
|
||||||
One example in the Linux kernel is the System V IPC (see the ipc_lock()
|
One example in the Linux kernel is the System V IPC (see the shm_lock()
|
||||||
function in ipc/util.c). This code checks a "deleted" flag under a
|
function in ipc/shm.c). This code checks a *deleted* flag under a
|
||||||
per-entry spinlock, and, if the "deleted" flag is set, pretends that the
|
per-entry spinlock, and, if the *deleted* flag is set, pretends that the
|
||||||
entry does not exist. For this to be helpful, the search function must
|
entry does not exist. For this to be helpful, the search function must
|
||||||
return holding the per-entry spinlock, as ipc_lock() does in fact do.
|
return holding the per-entry spinlock, as shm_lock() does in fact do.
|
||||||
|
|
||||||
|
.. _quick_quiz:
|
||||||
|
|
||||||
Quick Quiz:
|
Quick Quiz:
|
||||||
Why does the search function need to return holding the per-entry lock for
|
For the deleted-flag technique to be helpful, why is it necessary
|
||||||
this deleted-flag technique to be helpful?
|
to hold the per-entry lock while returning from the search function?
|
||||||
|
|
||||||
:ref:`Answer to Quick Quiz <answer_quick_quiz_list>`
|
:ref:`Answer to Quick Quiz <quick_quiz_answer>`
|
||||||
|
|
||||||
If the system-call audit module were to ever need to reject stale data,
|
If the system-call audit module were to ever need to reject stale data, one way
|
||||||
one way to accomplish this would be to add a "deleted" flag and a "lock"
|
to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the
|
||||||
spinlock to the audit_entry structure, and modify audit_filter_task()
|
audit_entry structure, and modify ``audit_filter_task()`` as follows::
|
||||||
as follows::
|
|
||||||
|
|
||||||
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
static enum audit_state audit_filter_task(struct task_struct *tsk)
|
||||||
{
|
{
|
||||||
|
@ -267,20 +327,20 @@ as follows::
|
||||||
}
|
}
|
||||||
|
|
||||||
Note that this example assumes that entries are only added and deleted.
|
Note that this example assumes that entries are only added and deleted.
|
||||||
Additional mechanism is required to deal correctly with the
|
Additional mechanism is required to deal correctly with the update-in-place
|
||||||
update-in-place performed by audit_upd_rule(). For one thing,
|
performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would
|
||||||
audit_upd_rule() would need additional memory barriers to ensure
|
need additional memory barriers to ensure that the list_add_rcu() was really
|
||||||
that the list_add_rcu() was really executed before the list_del_rcu().
|
executed before the list_del_rcu().
|
||||||
|
|
||||||
The audit_del_rule() function would need to set the "deleted"
|
The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the
|
||||||
flag under the spinlock as follows::
|
spinlock as follows::
|
||||||
|
|
||||||
static inline int audit_del_rule(struct audit_rule *rule,
|
static inline int audit_del_rule(struct audit_rule *rule,
|
||||||
struct list_head *list)
|
struct list_head *list)
|
||||||
{
|
{
|
||||||
struct audit_entry *e;
|
struct audit_entry *e;
|
||||||
|
|
||||||
/* Do not need to use the _rcu iterator here, since this
|
/* No need to use the _rcu iterator here, since this
|
||||||
* is the only deletion routine. */
|
* is the only deletion routine. */
|
||||||
list_for_each_entry(e, list, list) {
|
list_for_each_entry(e, list, list) {
|
||||||
if (!audit_compare_rule(rule, &e->rule)) {
|
if (!audit_compare_rule(rule, &e->rule)) {
|
||||||
|
@ -295,6 +355,91 @@ flag under the spinlock as follows::
|
||||||
return -EFAULT; /* No matching rule */
|
return -EFAULT; /* No matching rule */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
This too assumes that the caller holds ``audit_filter_mutex``.
|
||||||
|
|
||||||
|
|
||||||
|
Example 5: Skipping Stale Objects
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
For some use cases, reader performance can be improved by skipping stale objects
|
||||||
|
during read-side list traversal if the object in question is pending destruction
|
||||||
|
after one or more grace periods. One such example can be found in the timerfd
|
||||||
|
subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed, for example due to
|
||||||
|
setting of the system time, then all programmed timerfds that depend on this
|
||||||
|
clock get triggered and processes waiting on them to expire are woken up in
|
||||||
|
advance of their scheduled expiry. To facilitate this, all such timers are added
|
||||||
|
to an RCU-managed ``cancel_list`` when they are set up in
|
||||||
|
``timerfd_setup_cancel()``::
|
||||||
|
|
||||||
|
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
|
||||||
|
{
|
||||||
|
spin_lock(&ctx->cancel_lock);
|
||||||
|
if (ctx->clockid == CLOCK_REALTIME &&
|
||||||
|
(flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
|
||||||
|
if (!ctx->might_cancel) {
|
||||||
|
ctx->might_cancel = true;
|
||||||
|
spin_lock(&cancel_lock);
|
||||||
|
list_add_rcu(&ctx->clist, &cancel_list);
|
||||||
|
spin_unlock(&cancel_lock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
spin_unlock(&ctx->cancel_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the
|
||||||
|
timerfd object is cleared, the object removed from the ``cancel_list`` and
|
||||||
|
destroyed::
|
||||||
|
|
||||||
|
int timerfd_release(struct inode *inode, struct file *file)
|
||||||
|
{
|
||||||
|
struct timerfd_ctx *ctx = file->private_data;
|
||||||
|
|
||||||
|
spin_lock(&ctx->cancel_lock);
|
||||||
|
if (ctx->might_cancel) {
|
||||||
|
ctx->might_cancel = false;
|
||||||
|
spin_lock(&cancel_lock);
|
||||||
|
list_del_rcu(&ctx->clist);
|
||||||
|
spin_unlock(&cancel_lock);
|
||||||
|
}
|
||||||
|
spin_unlock(&ctx->cancel_lock);
|
||||||
|
|
||||||
|
hrtimer_cancel(&ctx->t.tmr);
|
||||||
|
kfree_rcu(ctx, rcu);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
If the ``CLOCK_REALTIME`` clock is set, for example by a time server, the
|
||||||
|
hrtimer framework calls ``timerfd_clock_was_set()`` which walks the
|
||||||
|
``cancel_list`` and wakes up processes waiting on the timerfd. While iterating
|
||||||
|
the ``cancel_list``, the ``might_cancel`` flag is consulted to skip stale
|
||||||
|
objects::
|
||||||
|
|
||||||
|
void timerfd_clock_was_set(void)
|
||||||
|
{
|
||||||
|
struct timerfd_ctx *ctx;
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
list_for_each_entry_rcu(ctx, &cancel_list, clist) {
|
||||||
|
if (!ctx->might_cancel)
|
||||||
|
continue;
|
||||||
|
spin_lock_irqsave(&ctx->wqh.lock, flags);
|
||||||
|
if (ctx->moffs != ktime_mono_to_real(0)) {
|
||||||
|
ctx->moffs = KTIME_MAX;
|
||||||
|
ctx->ticks++;
|
||||||
|
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
|
||||||
|
}
|
||||||
|
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
The key point here is, because RCU-traversal of the ``cancel_list`` happens
|
||||||
|
while objects are being added to and removed from the list, sometimes the traversal
|
||||||
|
can step on an object that has been removed from the list. In this example, it
|
||||||
|
is seen that it is better to skip such objects using a flag.
|
||||||
|
|
||||||
|
|
||||||
Summary
|
Summary
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
@ -303,19 +448,21 @@ the most amenable to use of RCU. The simplest case is where entries are
|
||||||
either added or deleted from the data structure (or atomically modified
|
either added or deleted from the data structure (or atomically modified
|
||||||
in place), but non-atomic in-place modifications can be handled by making
|
in place), but non-atomic in-place modifications can be handled by making
|
||||||
a copy, updating the copy, then replacing the original with the copy.
|
a copy, updating the copy, then replacing the original with the copy.
|
||||||
If stale data cannot be tolerated, then a "deleted" flag may be used
|
If stale data cannot be tolerated, then a *deleted* flag may be used
|
||||||
in conjunction with a per-entry spinlock in order to allow the search
|
in conjunction with a per-entry spinlock in order to allow the search
|
||||||
function to reject newly deleted data.
|
function to reject newly deleted data.
|
||||||
|
|
||||||
.. _answer_quick_quiz_list:
|
.. _quick_quiz_answer:
|
||||||
|
|
||||||
Answer to Quick Quiz:
|
Answer to Quick Quiz:
|
||||||
Why does the search function need to return holding the per-entry
|
For the deleted-flag technique to be helpful, why is it necessary
|
||||||
lock for this deleted-flag technique to be helpful?
|
to hold the per-entry lock while returning from the search function?
|
||||||
|
|
||||||
If the search function drops the per-entry lock before returning,
|
If the search function drops the per-entry lock before returning,
|
||||||
then the caller will be processing stale data in any case. If it
|
then the caller will be processing stale data in any case. If it
|
||||||
is really OK to be processing stale data, then you don't need a
|
is really OK to be processing stale data, then you don't need a
|
||||||
"deleted" flag. If processing stale data really is a problem,
|
*deleted* flag. If processing stale data really is a problem,
|
||||||
then you need to hold the per-entry lock across all of the code
|
then you need to hold the per-entry lock across all of the code
|
||||||
that uses the value that was returned.
|
that uses the value that was returned.
|
||||||
|
|
||||||
|
:ref:`Back to Quick Quiz <quick_quiz>`
|
||||||
|
|
|
@ -11,8 +11,8 @@ must be long enough that any readers accessing the item being deleted have
|
||||||
since dropped their references. For example, an RCU-protected deletion
|
since dropped their references. For example, an RCU-protected deletion
|
||||||
from a linked list would first remove the item from the list, wait for
|
from a linked list would first remove the item from the list, wait for
|
||||||
a grace period to elapse, then free the element. See the
|
a grace period to elapse, then free the element. See the
|
||||||
Documentation/RCU/listRCU.rst file for more information on using RCU with
|
:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` for more information on
|
||||||
linked lists.
|
using RCU with linked lists.
|
||||||
|
|
||||||
Frequently Asked Questions
|
Frequently Asked Questions
|
||||||
--------------------------
|
--------------------------
|
||||||
|
@ -50,7 +50,7 @@ Frequently Asked Questions
|
||||||
- If I am running on a uniprocessor kernel, which can only do one
|
- If I am running on a uniprocessor kernel, which can only do one
|
||||||
thing at a time, why should I wait for a grace period?
|
thing at a time, why should I wait for a grace period?
|
||||||
|
|
||||||
See the Documentation/RCU/UP.rst file for more information.
|
See :ref:`Documentation/RCU/UP.rst <up_doc>` for more information.
|
||||||
|
|
||||||
- How can I see where RCU is currently used in the Linux kernel?
|
- How can I see where RCU is currently used in the Linux kernel?
|
||||||
|
|
||||||
|
@ -68,18 +68,18 @@ Frequently Asked Questions
|
||||||
|
|
||||||
- Why the name "RCU"?
|
- Why the name "RCU"?
|
||||||
|
|
||||||
"RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst
|
"RCU" stands for "read-copy update".
|
||||||
has more information on where this name came from, search for
|
:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` has more information on where
|
||||||
"read-copy update" to find it.
|
this name came from; search for "read-copy update" to find it.
|
||||||
|
|
||||||
- I hear that RCU is patented? What is with that?
|
- I hear that RCU is patented? What is with that?
|
||||||
|
|
||||||
Yes, it is. There are several known patents related to RCU,
|
Yes, it is. There are several known patents related to RCU,
|
||||||
search for the string "Patent" in RTFP.txt to find them.
|
search for the string "Patent" in Documentation/RCU/RTFP.txt to find them.
|
||||||
Of these, one was allowed to lapse by the assignee, and the
|
Of these, one was allowed to lapse by the assignee, and the
|
||||||
others have been contributed to the Linux kernel under GPL.
|
others have been contributed to the Linux kernel under GPL.
|
||||||
There are now also LGPL implementations of user-level RCU
|
There are now also LGPL implementations of user-level RCU
|
||||||
available (http://liburcu.org/).
|
available (https://liburcu.org/).
|
||||||
|
|
||||||
- I hear that RCU needs work in order to support realtime kernels?
|
- I hear that RCU needs work in order to support realtime kernels?
|
||||||
|
|
||||||
|
@ -88,5 +88,5 @@ Frequently Asked Questions
|
||||||
|
|
||||||
- Where can I find more information on RCU?
|
- Where can I find more information on RCU?
|
||||||
|
|
||||||
See the RTFP.txt file in this directory.
|
See the Documentation/RCU/RTFP.txt file.
|
||||||
Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
|
Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
|
||||||
|
|
|
@ -124,9 +124,14 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than
|
||||||
debugging. The final "T" entry contains the totals of the counters.
|
debugging. The final "T" entry contains the totals of the counters.
|
||||||
|
|
||||||
|
|
||||||
USAGE
|
USAGE ON SPECIFIC KERNEL BUILDS
|
||||||
|
|
||||||
The following script may be used to torture RCU:
|
It is sometimes desirable to torture RCU on a specific kernel build,
|
||||||
|
for example, when preparing to put that kernel build into production.
|
||||||
|
In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
|
||||||
|
so that the test can be started using modprobe and terminated using rmmod.
|
||||||
|
|
||||||
|
For example, the following script may be used to torture RCU:
|
||||||
|
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
|
@ -142,8 +147,136 @@ checked for such errors. The "rmmod" command forces a "SUCCESS",
|
||||||
two are self-explanatory, while the last indicates that while there
|
two are self-explanatory, while the last indicates that while there
|
||||||
were no RCU failures, CPU-hotplug problems were detected.
|
were no RCU failures, CPU-hotplug problems were detected.
|
||||||
|
|
||||||
However, the tools/testing/selftests/rcutorture/bin/kvm.sh script
|
|
||||||
provides better automation, including automatic failure analysis.
|
USAGE ON MAINLINE KERNELS
|
||||||
It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd.
|
|
||||||
See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions
|
When using rcutorture to test changes to RCU itself, it is often
|
||||||
on setting up such an initrd.
|
necessary to build a number of kernels in order to test that change
|
||||||
|
across a broad range of combinations of the relevant Kconfig options
|
||||||
|
and of the relevant kernel boot parameters. In this situation, use
|
||||||
|
of modprobe and rmmod can be quite time-consuming and error-prone.
|
||||||
|
|
||||||
|
Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh
|
||||||
|
script is available for mainline testing for x86, arm64, and
|
||||||
|
powerpc. By default, it will run the series of tests specified by
|
||||||
|
tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test
|
||||||
|
running for 30 minutes within a guest OS using a minimal userspace
|
||||||
|
supplied by an automatically generated initrd. After the tests are
|
||||||
|
complete, the resulting build products and console output are analyzed
|
||||||
|
for errors and the results of the runs are summarized.
|
||||||
|
|
||||||
|
On larger systems, rcutorture testing can be accelerated by passing the
|
||||||
|
--cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43"
|
||||||
|
would use up to 43 CPUs to run tests concurrently, which as of v5.4 would
|
||||||
|
complete all the scenarios in two batches, reducing the time to complete
|
||||||
|
from about eight hours to about one hour (not counting the time to build
|
||||||
|
the sixteen kernels). The "--dryrun sched" argument will not run tests,
|
||||||
|
but rather tell you how the tests would be scheduled into batches. This
|
||||||
|
can be useful when working out how many CPUs to specify in the --cpus
|
||||||
|
argument.
|
||||||
|
|
||||||
|
Not all changes require that all scenarios be run. For example, a change
|
||||||
|
to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
|
||||||
|
--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'".
|
||||||
|
Large systems can run multiple copies of the full set of scenarios,
|
||||||
|
for example, a system with 448 hardware threads can run five instances
|
||||||
|
of the full set concurrently. To make this happen:
|
||||||
|
|
||||||
|
kvm.sh --cpus 448 --configs '5*CFLIST'
|
||||||
|
|
||||||
|
Alternatively, such a system can run 56 concurrent instances of a single
|
||||||
|
eight-CPU scenario:
|
||||||
|
|
||||||
|
kvm.sh --cpus 448 --configs '56*TREE04'
|
||||||
|
|
||||||
|
Or 28 concurrent instances of each of two eight-CPU scenarios:
|
||||||
|
|
||||||
|
kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
|
||||||
|
|
||||||
|
Of course, each concurrent instance will use memory, which can be
|
||||||
|
limited using the --memory argument, which defaults to 512M. Small
|
||||||
|
values for memory may require disabling the callback-flooding tests
|
||||||
|
using the --bootargs parameter discussed below.
|
||||||
|
|
||||||
|
Sometimes additional debugging is useful, and in such cases the --kconfig
|
||||||
|
parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'".
|
||||||
|
|
||||||
|
Kernel boot arguments can also be supplied, for example, to control
|
||||||
|
rcutorture's module parameters. For example, to test a change to RCU's
|
||||||
|
CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
|
||||||
|
This will of course result in the scripting reporting a failure, namely
|
||||||
|
the resulting RCU CPU stall warning. As noted above, reducing memory may
|
||||||
|
require disabling rcutorture's callback-flooding tests:
|
||||||
|
|
||||||
|
kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
|
||||||
|
--bootargs 'rcutorture.fwd_progress=0'
|
||||||
|
|
||||||
|
Sometimes all that is needed is a full set of kernel builds. This is
|
||||||
|
what the --buildonly argument does.
|
||||||
|
|
||||||
|
Finally, the --trust-make argument allows each kernel build to reuse what
|
||||||
|
it can from the previous kernel build.
|
||||||
|
|
||||||
|
There are additional more arcane arguments that are documented in the
|
||||||
|
source code of the kvm.sh script.
|
||||||
|
|
||||||
|
If a run contains failures, the number of buildtime and runtime failures
|
||||||
|
is listed at the end of the kvm.sh output, which you really should redirect
|
||||||
|
to a file. The build products and console output of each run are kept in
|
||||||
|
tools/testing/selftests/rcutorture/res in timestamped directories. A
|
||||||
|
given directory can be supplied to kvm-find-errors.sh in order to have
|
||||||
|
it cycle you through summaries of errors and full error logs. For example:
|
||||||
|
|
||||||
|
tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
|
||||||
|
tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
|
||||||
|
|
||||||
|
However, it is often more convenient to access the files directly.
|
||||||
|
Files pertaining to all scenarios in a run reside in the top-level
|
||||||
|
directory (2020.01.20-15.54.23 in the example above), while per-scenario
|
||||||
|
files reside in a subdirectory named after the scenario (for example,
|
||||||
|
"TREE04"). If a given scenario ran more than once (as in "--configs
|
||||||
|
'56*TREE04'" above), the directories corresponding to the second and
|
||||||
|
subsequent runs of that scenario include a sequence number, for example,
|
||||||
|
"TREE04.2", "TREE04.3", and so on.
|
||||||
|
|
||||||
|
The most frequently used file in the top-level directory is testid.txt.
|
||||||
|
If the test ran in a git repository, then this file contains the commit
|
||||||
|
that was tested and any uncommitted changes in diff format.
|
||||||
|
|
||||||
|
The most frequently used files in each per-scenario-run directory are:
|
||||||
|
|
||||||
|
.config: This file contains the Kconfig options.
|
||||||
|
|
||||||
|
Make.out: This contains build output for a specific scenario.
|
||||||
|
|
||||||
|
console.log: This contains the console output for a specific scenario.
|
||||||
|
This file may be examined once the kernel has booted, but
|
||||||
|
it might not exist if the build failed.
|
||||||
|
|
||||||
|
vmlinux: This contains the kernel, which can be useful with tools like
|
||||||
|
objdump and gdb.
|
||||||
|
|
||||||
|
A number of additional files are available, but are less frequently used.
|
||||||
|
Many are intended for debugging of rcutorture itself or of its scripting.
|
||||||
|
|
||||||
|
As of v5.4, a successful run with the default set of scenarios produces
|
||||||
|
the following summary at the end of the run on a 12-CPU system:
|
||||||
|
|
||||||
|
SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
|
||||||
|
SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
|
||||||
|
SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
|
||||||
|
SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
|
||||||
|
TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
|
||||||
|
TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
|
||||||
|
TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
|
||||||
|
TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
|
||||||
|
TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
|
||||||
|
TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
|
||||||
|
TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
|
||||||
|
TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
|
||||||
|
CPU count limited from 16 to 12
|
||||||
|
TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
|
||||||
|
TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
|
||||||
|
TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
|
||||||
|
CPU count limited from 16 to 12
|
||||||
|
TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
.. _psi:
|
||||||
|
|
||||||
================================
|
================================
|
||||||
PSI - Pressure Stall Information
|
PSI - Pressure Stall Information
|
||||||
================================
|
================================
|
||||||
|
|
|
@ -18,7 +18,7 @@ may look as follows::
|
||||||
|
|
||||||
$ ls -l /sys/bus/acpi/devices/INT3404:00/
|
$ ls -l /sys/bus/acpi/devices/INT3404:00/
|
||||||
total 0
|
total 0
|
||||||
...
|
...
|
||||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state0
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state0
|
||||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state1
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state1
|
||||||
-r--r--r-- 1 root root 4096 Dec 13 20:38 state10
|
-r--r--r-- 1 root root 4096 Dec 13 20:38 state10
|
||||||
|
@ -38,7 +38,7 @@ where each of the "state*" files represents one performance state of the fan
|
||||||
and contains a colon-separated list of 5 integer numbers (fields) with the
|
and contains a colon-separated list of 5 integer numbers (fields) with the
|
||||||
following interpretation::
|
following interpretation::
|
||||||
|
|
||||||
control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
|
control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
|
||||||
|
|
||||||
* ``control_percent``: The percent value to be used to set the fan speed to a
|
* ``control_percent``: The percent value to be used to set the fan speed to a
|
||||||
specific level using the _FSL object (0-100).
|
specific level using the _FSL object (0-100).
|
||||||
|
|
|
@ -33,6 +33,12 @@ max
|
||||||
a per-instance limit. If ``max=<count>`` is set then only ``<count>`` number
|
a per-instance limit. If ``max=<count>`` is set then only ``<count>`` number
|
||||||
of binder devices can be allocated in this binderfs instance.
|
of binder devices can be allocated in this binderfs instance.
|
||||||
|
|
||||||
|
stats
|
||||||
|
Using ``stats=global`` enables global binder statistics.
|
||||||
|
``stats=global`` is only available for a binderfs instance mounted in the
|
||||||
|
initial user namespace. An attempt to use the option to mount a binderfs
|
||||||
|
instance in another user namespace will return a permission error.
|
||||||
|
|
||||||
Allocating binder Devices
|
Allocating binder Devices
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
Kernel Support for miscellaneous (your favourite) Binary Formats v1.1
|
Kernel Support for miscellaneous Binary Formats (binfmt_misc)
|
||||||
=====================================================================
|
=============================================================
|
||||||
|
|
||||||
This Kernel feature allows you to invoke almost (for restrictions see below)
|
This Kernel feature allows you to invoke almost (for restrictions see below)
|
||||||
every program by simply typing its name in the shell.
|
every program by simply typing its name in the shell.
|
||||||
|
@ -140,8 +140,8 @@ Hints
|
||||||
-----
|
-----
|
||||||
|
|
||||||
If you want to pass special arguments to your interpreter, you can
|
If you want to pass special arguments to your interpreter, you can
|
||||||
write a wrapper script for it. See Documentation/admin-guide/java.rst for an
|
write a wrapper script for it.
|
||||||
example.
|
See :doc:`Documentation/admin-guide/java.rst <./java>` for an example.
|
||||||
|
|
||||||
Your interpreter should NOT look in the PATH for the filename; the kernel
|
Your interpreter should NOT look in the PATH for the filename; the kernel
|
||||||
passes it the full filename (or the file descriptor) to use. Using ``$PATH`` can
|
passes it the full filename (or the file descriptor) to use. Using ``$PATH`` can
|
||||||
|
|
|
@ -251,8 +251,6 @@ line of text and contains the following stats separated by whitespace:
|
||||||
|
|
||||||
================ =============================================================
|
================ =============================================================
|
||||||
orig_data_size uncompressed size of data stored in this disk.
|
orig_data_size uncompressed size of data stored in this disk.
|
||||||
This excludes same-element-filled pages (same_pages) since
|
|
||||||
no memory is allocated for them.
|
|
||||||
Unit: bytes
|
Unit: bytes
|
||||||
compr_data_size compressed size of data stored in this disk
|
compr_data_size compressed size of data stored in this disk
|
||||||
mem_used_total the amount of memory allocated for this disk. This
|
mem_used_total the amount of memory allocated for this disk. This
|
||||||
|
|
|
@ -23,7 +23,7 @@ of dot-connected-words, and key and value are connected by ``=``. The value
|
||||||
has to be terminated by semi-colon (``;``) or newline (``\n``).
|
has to be terminated by semi-colon (``;``) or newline (``\n``).
|
||||||
For array value, array entries are separated by comma (``,``). ::
|
For array value, array entries are separated by comma (``,``). ::
|
||||||
|
|
||||||
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
|
||||||
|
|
||||||
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
|
Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
|
||||||
|
|
||||||
|
@ -62,6 +62,30 @@ Or more shorter, written as following::
|
||||||
In both styles, same key words are automatically merged when parsing it
|
In both styles, same key words are automatically merged when parsing it
|
||||||
at boot time. So you can append similar trees or key-values.
|
at boot time. So you can append similar trees or key-values.
|
||||||
|
|
||||||
|
Same-key Values
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Two or more values or arrays must not share the same key.
|
||||||
|
For example::
|
||||||
|
|
||||||
|
foo = bar, baz
|
||||||
|
foo = qux # !ERROR! we can not re-define same key
|
||||||
|
|
||||||
|
If you want to append a value to an existing key as an array member,
|
||||||
|
you can use ``+=`` operator. For example::
|
||||||
|
|
||||||
|
foo = bar, baz
|
||||||
|
foo += qux
|
||||||
|
|
||||||
|
In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``.
|
||||||
|
|
||||||
|
However, a sub-key and a value can not co-exist under a parent key.
|
||||||
|
For example, the following config is NOT allowed::
|
||||||
|
|
||||||
|
foo = value1
|
||||||
|
foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
|
||||||
|
|
||||||
|
|
||||||
Comments
|
Comments
|
||||||
--------
|
--------
|
||||||
|
|
||||||
|
@ -102,9 +126,13 @@ Boot Kernel With a Boot Config
|
||||||
==============================
|
==============================
|
||||||
|
|
||||||
Since the boot configuration file is loaded with initrd, it will be added
|
Since the boot configuration file is loaded with initrd, it will be added
|
||||||
to the end of the initrd (initramfs) image file. The Linux kernel decodes
|
to the end of the initrd (initramfs) image file with size, checksum and
|
||||||
the last part of the initrd image in memory to get the boot configuration
|
12-byte magic word as below.
|
||||||
data.
|
|
||||||
|
[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
|
||||||
|
|
||||||
|
The Linux kernel decodes the last part of the initrd image in memory to
|
||||||
|
get the boot configuration data.
|
||||||
Because of this "piggyback" method, there is no need to change or
|
Because of this "piggyback" method, there is no need to change or
|
||||||
update the boot loader and the kernel image itself.
|
update the boot loader and the kernel image itself.
|
||||||
|
|
||||||
|
|
|
@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
|
||||||
automatically tracks the value of node_states[N_MEMORY]--i.e.,
|
automatically tracks the value of node_states[N_MEMORY]--i.e.,
|
||||||
nodes with memory--using the cpuset_track_online_nodes() hook.
|
nodes with memory--using the cpuset_track_online_nodes() hook.
|
||||||
|
|
||||||
|
The cpuset.effective_cpus and cpuset.effective_mems files are
|
||||||
|
normally read-only copies of cpuset.cpus and cpuset.mems files
|
||||||
|
respectively. If the cpuset cgroup filesystem is mounted with the
|
||||||
|
special "cpuset_v2_mode" option, the behavior of these files will become
|
||||||
|
similar to the corresponding files in cpuset v2. In other words, hotplug
|
||||||
|
events will not change cpuset.cpus and cpuset.mems. Those events will
|
||||||
|
only affect cpuset.effective_cpus and cpuset.effective_mems which show
|
||||||
|
the actual cpus and memory nodes that are currently used by this cpuset.
|
||||||
|
See Documentation/admin-guide/cgroup-v2.rst for more information about
|
||||||
|
cpuset v2 behavior.
|
||||||
|
|
||||||
|
|
||||||
1.4 What are exclusive cpusets ?
|
1.4 What are exclusive cpusets ?
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
|
@ -2,13 +2,6 @@
|
||||||
HugeTLB Controller
|
HugeTLB Controller
|
||||||
==================
|
==================
|
||||||
|
|
||||||
The HugeTLB controller allows to limit the HugeTLB usage per control group and
|
|
||||||
enforces the controller limit during page fault. Since HugeTLB doesn't
|
|
||||||
support page reclaim, enforcing the limit at page fault time implies that,
|
|
||||||
the application will get SIGBUS signal if it tries to access HugeTLB pages
|
|
||||||
beyond its limit. This requires the application to know beforehand how much
|
|
||||||
HugeTLB pages it would require for its use.
|
|
||||||
|
|
||||||
HugeTLB controller can be created by first mounting the cgroup filesystem.
|
HugeTLB controller can be created by first mounting the cgroup filesystem.
|
||||||
|
|
||||||
# mount -t cgroup -o hugetlb none /sys/fs/cgroup
|
# mount -t cgroup -o hugetlb none /sys/fs/cgroup
|
||||||
|
@ -28,10 +21,14 @@ process (bash) into it.
|
||||||
|
|
||||||
Brief summary of control files::
|
Brief summary of control files::
|
||||||
|
|
||||||
hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
|
hugetlb.<hugepagesize>.rsvd.limit_in_bytes # set/show limit of "hugepagesize" hugetlb reservations
|
||||||
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
|
hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes # show max "hugepagesize" hugetlb reservations and no-reserve faults
|
||||||
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
hugetlb.<hugepagesize>.rsvd.usage_in_bytes # show current reservations and no-reserve faults for "hugepagesize" hugetlb
|
||||||
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
|
hugetlb.<hugepagesize>.rsvd.failcnt # show the number of allocation failures due to HugeTLB reservation limit
|
||||||
|
hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb faults
|
||||||
|
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
|
||||||
|
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
||||||
|
hugetlb.<hugepagesize>.failcnt # show the number of allocation failures due to HugeTLB usage limit
|
||||||
|
|
||||||
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
|
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
|
||||||
files include::
|
files include::
|
||||||
|
@ -40,11 +37,95 @@ files include::
|
||||||
hugetlb.1GB.max_usage_in_bytes
|
hugetlb.1GB.max_usage_in_bytes
|
||||||
hugetlb.1GB.usage_in_bytes
|
hugetlb.1GB.usage_in_bytes
|
||||||
hugetlb.1GB.failcnt
|
hugetlb.1GB.failcnt
|
||||||
|
hugetlb.1GB.rsvd.limit_in_bytes
|
||||||
|
hugetlb.1GB.rsvd.max_usage_in_bytes
|
||||||
|
hugetlb.1GB.rsvd.usage_in_bytes
|
||||||
|
hugetlb.1GB.rsvd.failcnt
|
||||||
hugetlb.64KB.limit_in_bytes
|
hugetlb.64KB.limit_in_bytes
|
||||||
hugetlb.64KB.max_usage_in_bytes
|
hugetlb.64KB.max_usage_in_bytes
|
||||||
hugetlb.64KB.usage_in_bytes
|
hugetlb.64KB.usage_in_bytes
|
||||||
hugetlb.64KB.failcnt
|
hugetlb.64KB.failcnt
|
||||||
|
hugetlb.64KB.rsvd.limit_in_bytes
|
||||||
|
hugetlb.64KB.rsvd.max_usage_in_bytes
|
||||||
|
hugetlb.64KB.rsvd.usage_in_bytes
|
||||||
|
hugetlb.64KB.rsvd.failcnt
|
||||||
hugetlb.32MB.limit_in_bytes
|
hugetlb.32MB.limit_in_bytes
|
||||||
hugetlb.32MB.max_usage_in_bytes
|
hugetlb.32MB.max_usage_in_bytes
|
||||||
hugetlb.32MB.usage_in_bytes
|
hugetlb.32MB.usage_in_bytes
|
||||||
hugetlb.32MB.failcnt
|
hugetlb.32MB.failcnt
|
||||||
|
hugetlb.32MB.rsvd.limit_in_bytes
|
||||||
|
hugetlb.32MB.rsvd.max_usage_in_bytes
|
||||||
|
hugetlb.32MB.rsvd.usage_in_bytes
|
||||||
|
hugetlb.32MB.rsvd.failcnt
|
||||||
|
|
||||||
|
|
||||||
|
1. Page fault accounting
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.limit_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.max_usage_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.usage_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.failcnt
|
||||||
|
|
||||||
|
The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per
|
||||||
|
control group and enforces the limit during page fault. Since HugeTLB
|
||||||
|
doesn't support page reclaim, enforcing the limit at page fault time implies
|
||||||
|
that, the application will get SIGBUS signal if it tries to fault in HugeTLB
|
||||||
|
pages beyond its limit. Therefore the application needs to know exactly how many
|
||||||
|
HugeTLB pages it uses beforehand, and the sysadmin needs to make sure that
|
||||||
|
there are enough available on the machine for all the users to avoid processes
|
||||||
|
getting SIGBUS.
|
||||||
|
|
||||||
|
|
||||||
|
2. Reservation accounting
|
||||||
|
|
||||||
|
hugetlb.<hugepagesize>.rsvd.limit_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.rsvd.max_usage_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.rsvd.usage_in_bytes
|
||||||
|
hugetlb.<hugepagesize>.rsvd.failcnt
|
||||||
|
|
||||||
|
The HugeTLB controller allows users to limit the HugeTLB reservations per control
|
||||||
|
group and enforces the controller limit at reservation time and at the fault of
|
||||||
|
HugeTLB memory for which no reservation exists. Since reservation limits are
|
||||||
|
enforced at reservation time (on mmap or shmget), reservation limits never cause
|
||||||
|
the application to get SIGBUS signal if the memory was reserved beforehand. For
|
||||||
|
MAP_NORESERVE allocations, the reservation limit behaves the same as the fault
|
||||||
|
limit, enforcing memory usage at fault time and causing the application to
|
||||||
|
receive a SIGBUS if it's crossing its limit.
|
||||||
|
|
||||||
|
Reservation limits are superior to page fault limits described above, since
|
||||||
|
reservation limits are enforced at reservation time (on mmap or shmget), and
|
||||||
|
never cause the application to get SIGBUS signal if the memory was reserved
|
||||||
|
beforehand. This allows for easier fallback to alternatives such as
|
||||||
|
non-HugeTLB memory for example. In the case of page fault accounting, it's very
|
||||||
|
hard to avoid processes getting SIGBUS since the sysadmin needs to know precisely
|
||||||
|
the HugeTLB usage of all the tasks in the system and make sure there are enough
|
||||||
|
pages to satisfy all requests. Avoiding tasks getting SIGBUS on overcommitted
|
||||||
|
systems is practically impossible with page fault accounting.
|
||||||
|
|
||||||
|
|
||||||
|
3. Caveats with shared memory
|
||||||
|
|
||||||
|
For shared HugeTLB memory, both HugeTLB reservation and page faults are charged
|
||||||
|
to the first task that causes the memory to be reserved or faulted, and all
|
||||||
|
subsequent uses of this reserved or faulted memory are done without charging.
|
||||||
|
|
||||||
|
Shared HugeTLB memory is only uncharged when it is unreserved or deallocated.
|
||||||
|
This is usually when the HugeTLB file is deleted, and not when the task that
|
||||||
|
caused the reservation or fault has exited.
|
||||||
|
|
||||||
|
|
||||||
|
4. Caveats with HugeTLB cgroup offline.
|
||||||
|
|
||||||
|
When a HugeTLB cgroup goes offline with some reservations or faults still
|
||||||
|
charged to it, the behavior is as follows:
|
||||||
|
|
||||||
|
- The fault charges are charged to the parent HugeTLB cgroup (reparented),
|
||||||
|
- the reservation charges remain on the offline HugeTLB cgroup.
|
||||||
|
|
||||||
|
This means that if a HugeTLB cgroup gets offlined while there are still HugeTLB
|
||||||
|
reservations charged to it, that cgroup persists as a zombie until all HugeTLB
|
||||||
|
reservations are uncharged. HugeTLB reservations behave in this manner to match
|
||||||
|
the memory controller whose cgroups also persist as zombie until all charged
|
||||||
|
memory is uncharged. Also, the tracking of HugeTLB reservations is a bit more
|
||||||
|
complex compared to the tracking of HugeTLB faults, so it is significantly
|
||||||
|
harder to reparent reservations at offline time.
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
.. _cgroup-v1:
|
||||||
|
|
||||||
========================
|
========================
|
||||||
Control Groups version 1
|
Control Groups version 1
|
||||||
========================
|
========================
|
||||||
|
|
|
@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
|
||||||
conventions of cgroup v2. It describes all userland-visible aspects
|
conventions of cgroup v2. It describes all userland-visible aspects
|
||||||
of cgroup including core and specific controller behaviors. All
|
of cgroup including core and specific controller behaviors. All
|
||||||
future changes must be reflected in this document. Documentation for
|
future changes must be reflected in this document. Documentation for
|
||||||
v1 is available under Documentation/admin-guide/cgroup-v1/.
|
v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
|
||||||
|
|
||||||
.. CONTENTS
|
.. CONTENTS
|
||||||
|
|
||||||
|
@ -188,6 +188,17 @@ cgroup v2 currently supports the following mount options.
|
||||||
modified through remount from the init namespace. The mount
|
modified through remount from the init namespace. The mount
|
||||||
option is ignored on non-init namespace mounts.
|
option is ignored on non-init namespace mounts.
|
||||||
|
|
||||||
|
memory_recursiveprot
|
||||||
|
|
||||||
|
Recursively apply memory.min and memory.low protection to
|
||||||
|
entire subtrees, without requiring explicit downward
|
||||||
|
propagation into leaf cgroups. This allows protecting entire
|
||||||
|
subtrees from one another, while retaining free competition
|
||||||
|
within those subtrees. This should have been the default
|
||||||
|
behavior but is a mount-option to avoid regressing setups
|
||||||
|
relying on the original semantics (e.g. specifying bogusly
|
||||||
|
high 'bypass' protection values at higher tree levels).
|
||||||
|
|
||||||
|
|
||||||
Organizing Processes and Threads
|
Organizing Processes and Threads
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
@ -1023,7 +1034,7 @@ All time durations are in microseconds.
|
||||||
A read-only nested-key file which exists on non-root cgroups.
|
A read-only nested-key file which exists on non-root cgroups.
|
||||||
|
|
||||||
Shows pressure stall information for CPU. See
|
Shows pressure stall information for CPU. See
|
||||||
Documentation/accounting/psi.rst for details.
|
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||||
|
|
||||||
cpu.uclamp.min
|
cpu.uclamp.min
|
||||||
A read-write single value file which exists on non-root cgroups.
|
A read-write single value file which exists on non-root cgroups.
|
||||||
|
@ -1103,7 +1114,7 @@ PAGE_SIZE multiple when read back.
|
||||||
proportionally to the overage, reducing reclaim pressure for
|
proportionally to the overage, reducing reclaim pressure for
|
||||||
smaller overages.
|
smaller overages.
|
||||||
|
|
||||||
Effective min boundary is limited by memory.min values of
|
Effective min boundary is limited by memory.min values of
|
||||||
all ancestor cgroups. If there is memory.min overcommitment
|
all ancestor cgroups. If there is memory.min overcommitment
|
||||||
(child cgroup or cgroups are requiring more protected memory
|
(child cgroup or cgroups are requiring more protected memory
|
||||||
than parent will allow), then each child cgroup will get
|
than parent will allow), then each child cgroup will get
|
||||||
|
@ -1313,53 +1324,41 @@ PAGE_SIZE multiple when read back.
|
||||||
Number of major page faults incurred
|
Number of major page faults incurred
|
||||||
|
|
||||||
workingset_refault
|
workingset_refault
|
||||||
|
|
||||||
Number of refaults of previously evicted pages
|
Number of refaults of previously evicted pages
|
||||||
|
|
||||||
workingset_activate
|
workingset_activate
|
||||||
|
|
||||||
Number of refaulted pages that were immediately activated
|
Number of refaulted pages that were immediately activated
|
||||||
|
|
||||||
workingset_nodereclaim
|
workingset_nodereclaim
|
||||||
|
|
||||||
Number of times a shadow node has been reclaimed
|
Number of times a shadow node has been reclaimed
|
||||||
|
|
||||||
pgrefill
|
pgrefill
|
||||||
|
|
||||||
Amount of scanned pages (in an active LRU list)
|
Amount of scanned pages (in an active LRU list)
|
||||||
|
|
||||||
pgscan
|
pgscan
|
||||||
|
|
||||||
Amount of scanned pages (in an inactive LRU list)
|
Amount of scanned pages (in an inactive LRU list)
|
||||||
|
|
||||||
pgsteal
|
pgsteal
|
||||||
|
|
||||||
Amount of reclaimed pages
|
Amount of reclaimed pages
|
||||||
|
|
||||||
pgactivate
|
pgactivate
|
||||||
|
|
||||||
Amount of pages moved to the active LRU list
|
Amount of pages moved to the active LRU list
|
||||||
|
|
||||||
pgdeactivate
|
pgdeactivate
|
||||||
|
|
||||||
Amount of pages moved to the inactive LRU list
|
Amount of pages moved to the inactive LRU list
|
||||||
|
|
||||||
pglazyfree
|
pglazyfree
|
||||||
|
|
||||||
Amount of pages postponed to be freed under memory pressure
|
Amount of pages postponed to be freed under memory pressure
|
||||||
|
|
||||||
pglazyfreed
|
pglazyfreed
|
||||||
|
|
||||||
Amount of reclaimed lazyfree pages
|
Amount of reclaimed lazyfree pages
|
||||||
|
|
||||||
thp_fault_alloc
|
thp_fault_alloc
|
||||||
|
|
||||||
Number of transparent hugepages which were allocated to satisfy
|
Number of transparent hugepages which were allocated to satisfy
|
||||||
a page fault, including COW faults. This counter is not present
|
a page fault, including COW faults. This counter is not present
|
||||||
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||||
|
|
||||||
thp_collapse_alloc
|
thp_collapse_alloc
|
||||||
|
|
||||||
Number of transparent hugepages which were allocated to allow
|
Number of transparent hugepages which were allocated to allow
|
||||||
collapsing an existing range of pages. This counter is not
|
collapsing an existing range of pages. This counter is not
|
||||||
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||||
|
@ -1403,7 +1402,7 @@ PAGE_SIZE multiple when read back.
|
||||||
A read-only nested-key file which exists on non-root cgroups.
|
A read-only nested-key file which exists on non-root cgroups.
|
||||||
|
|
||||||
Shows pressure stall information for memory. See
|
Shows pressure stall information for memory. See
|
||||||
Documentation/accounting/psi.rst for details.
|
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||||
|
|
||||||
|
|
||||||
Usage Guidelines
|
Usage Guidelines
|
||||||
|
@ -1478,7 +1477,7 @@ IO Interface Files
|
||||||
dios Number of discard IOs
|
dios Number of discard IOs
|
||||||
====== =====================
|
====== =====================
|
||||||
|
|
||||||
An example read output follows:
|
An example read output follows::
|
||||||
|
|
||||||
8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
|
8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
|
||||||
8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
|
8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
|
||||||
|
@ -1643,7 +1642,7 @@ IO Interface Files
|
||||||
A read-only nested-key file which exists on non-root cgroups.
|
A read-only nested-key file which exists on non-root cgroups.
|
||||||
|
|
||||||
Shows pressure stall information for IO. See
|
Shows pressure stall information for IO. See
|
||||||
Documentation/accounting/psi.rst for details.
|
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||||
|
|
||||||
|
|
||||||
Writeback
|
Writeback
|
||||||
|
@ -1853,7 +1852,7 @@ Cpuset Interface Files
|
||||||
from the requested CPUs.
|
from the requested CPUs.
|
||||||
|
|
||||||
The CPU numbers are comma-separated numbers or ranges.
|
The CPU numbers are comma-separated numbers or ranges.
|
||||||
For example:
|
For example::
|
||||||
|
|
||||||
# cat cpuset.cpus
|
# cat cpuset.cpus
|
||||||
0-4,6,8-10
|
0-4,6,8-10
|
||||||
|
@ -1892,7 +1891,7 @@ Cpuset Interface Files
|
||||||
from the requested memory nodes.
|
from the requested memory nodes.
|
||||||
|
|
||||||
The memory node numbers are comma-separated numbers or ranges.
|
The memory node numbers are comma-separated numbers or ranges.
|
||||||
For example:
|
For example::
|
||||||
|
|
||||||
# cat cpuset.mems
|
# cat cpuset.mems
|
||||||
0-1,3
|
0-1,3
|
||||||
|
|
|
@ -54,6 +54,9 @@ If you make a mistake with the syntax, the write will fail thus::
|
||||||
<debugfs>/dynamic_debug/control
|
<debugfs>/dynamic_debug/control
|
||||||
-bash: echo: write error: Invalid argument
|
-bash: echo: write error: Invalid argument
|
||||||
|
|
||||||
|
Note, for systems without 'debugfs' enabled, the control file can be
|
||||||
|
found in ``/proc/dynamic_debug/control``.
|
||||||
|
|
||||||
Viewing Dynamic Debug Behaviour
|
Viewing Dynamic Debug Behaviour
|
||||||
===============================
|
===============================
|
||||||
|
|
||||||
|
|
|
@ -11,11 +11,13 @@ Today, with the advent of Kernel Mode Setting, a graphics board is
|
||||||
either correctly working because all components follow the standards -
|
either correctly working because all components follow the standards -
|
||||||
or the computer is unusable, because the screen remains dark after
|
or the computer is unusable, because the screen remains dark after
|
||||||
booting or it displays the wrong area. Cases when this happens are:
|
booting or it displays the wrong area. Cases when this happens are:
|
||||||
|
|
||||||
- The graphics board does not recognize the monitor.
|
- The graphics board does not recognize the monitor.
|
||||||
- The graphics board is unable to detect any EDID data.
|
- The graphics board is unable to detect any EDID data.
|
||||||
- The graphics board incorrectly forwards EDID data to the driver.
|
- The graphics board incorrectly forwards EDID data to the driver.
|
||||||
- The monitor sends no or bogus EDID data.
|
- The monitor sends no or bogus EDID data.
|
||||||
- A KVM sends its own EDID data instead of querying the connected monitor.
|
- A KVM sends its own EDID data instead of querying the connected monitor.
|
||||||
|
|
||||||
Adding the kernel parameter "nomodeset" helps in most cases, but causes
|
Adding the kernel parameter "nomodeset" helps in most cases, but causes
|
||||||
restrictions later on.
|
restrictions later on.
|
||||||
|
|
||||||
|
@ -32,7 +34,7 @@ individual data for a specific misbehaving monitor, commented sources
|
||||||
and a Makefile environment are given here.
|
and a Makefile environment are given here.
|
||||||
|
|
||||||
To create binary EDID and C source code files from the existing data
|
To create binary EDID and C source code files from the existing data
|
||||||
material, simply type "make".
|
material, simply type "make" in tools/edid/.
|
||||||
|
|
||||||
If you want to create your own EDID file, copy the file 1024x768.S,
|
If you want to create your own EDID file, copy the file 1024x768.S,
|
||||||
replace the settings with your own data and add a new target to the
|
replace the settings with your own data and add a new target to the
|
|
@ -136,8 +136,6 @@ enables the mitigation by default.
|
||||||
The mitigation can be controlled at boot time via a kernel command line option.
|
The mitigation can be controlled at boot time via a kernel command line option.
|
||||||
See :ref:`taa_mitigation_control_command_line`.
|
See :ref:`taa_mitigation_control_command_line`.
|
||||||
|
|
||||||
.. _virt_mechanism:
|
|
||||||
|
|
||||||
Virtualization mitigation
|
Virtualization mitigation
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
|
|
@ -75,6 +75,7 @@ configure specific aspects of kernel behavior to your liking.
|
||||||
cputopology
|
cputopology
|
||||||
dell_rbu
|
dell_rbu
|
||||||
device-mapper/index
|
device-mapper/index
|
||||||
|
edid
|
||||||
efi-stub
|
efi-stub
|
||||||
ext4
|
ext4
|
||||||
nfs/index
|
nfs/index
|
||||||
|
|
|
@ -100,7 +100,7 @@ Field 10 -- # of milliseconds spent doing I/Os (unsigned int)
|
||||||
|
|
||||||
Since 5.0 this field counts jiffies when at least one request was
|
Since 5.0 this field counts jiffies when at least one request was
|
||||||
started or completed. If request runs more than 2 jiffies then some
|
started or completed. If request runs more than 2 jiffies then some
|
||||||
I/O time will not be accounted unless there are other requests.
|
I/O time might not be accounted in case of concurrent requests.
|
||||||
|
|
||||||
Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int)
|
Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int)
|
||||||
This field is incremented at each I/O start, I/O completion, I/O
|
This field is incremented at each I/O start, I/O completion, I/O
|
||||||
|
@ -143,6 +143,9 @@ are summed (possibly overflowing the unsigned long variable they are
|
||||||
summed to) and the result given to the user. There is no convenient
|
summed to) and the result given to the user. There is no convenient
|
||||||
user interface for accessing the per-CPU counters themselves.
|
user interface for accessing the per-CPU counters themselves.
|
||||||
|
|
||||||
|
Since 4.19 request times are measured with nanoseconds precision and
|
||||||
|
truncated to milliseconds before showing in this interface.
|
||||||
|
|
||||||
Disks vs Partitions
|
Disks vs Partitions
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|
|
@ -22,11 +22,13 @@
|
||||||
default: 0
|
default: 0
|
||||||
|
|
||||||
acpi_backlight= [HW,ACPI]
|
acpi_backlight= [HW,ACPI]
|
||||||
acpi_backlight=vendor
|
{ vendor | video | native | none }
|
||||||
acpi_backlight=video
|
If set to vendor, prefer vendor-specific driver
|
||||||
If set to vendor, prefer vendor specific driver
|
|
||||||
(e.g. thinkpad_acpi, sony_acpi, etc.) instead
|
(e.g. thinkpad_acpi, sony_acpi, etc.) instead
|
||||||
of the ACPI video.ko driver.
|
of the ACPI video.ko driver.
|
||||||
|
If set to video, use the ACPI video.ko driver.
|
||||||
|
If set to native, use the device's native backlight mode.
|
||||||
|
If set to none, disable the ACPI backlight interface.
|
||||||
|
|
||||||
acpi_force_32bit_fadt_addr
|
acpi_force_32bit_fadt_addr
|
||||||
force FADT to use 32 bit addresses rather than the
|
force FADT to use 32 bit addresses rather than the
|
||||||
|
@ -136,6 +138,10 @@
|
||||||
dynamic table installation which will install SSDT
|
dynamic table installation which will install SSDT
|
||||||
tables to /sys/firmware/acpi/tables/dynamic.
|
tables to /sys/firmware/acpi/tables/dynamic.
|
||||||
|
|
||||||
|
acpi_no_watchdog [HW,ACPI,WDT]
|
||||||
|
Ignore the ACPI-based watchdog interface (WDAT) and let
|
||||||
|
a native driver control the watchdog device instead.
|
||||||
|
|
||||||
acpi_rsdp= [ACPI,EFI,KEXEC]
|
acpi_rsdp= [ACPI,EFI,KEXEC]
|
||||||
Pass the RSDP address to the kernel, mostly used
|
Pass the RSDP address to the kernel, mostly used
|
||||||
on machines running EFI runtime service to boot the
|
on machines running EFI runtime service to boot the
|
||||||
|
@ -446,6 +452,9 @@
|
||||||
bert_disable [ACPI]
|
bert_disable [ACPI]
|
||||||
Disable BERT OS support on buggy BIOSes.
|
Disable BERT OS support on buggy BIOSes.
|
||||||
|
|
||||||
|
bgrt_disable [ACPI][X86]
|
||||||
|
Disable BGRT to avoid flickering OEM logo.
|
||||||
|
|
||||||
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
|
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
|
||||||
bttv.radio= Most important insmod options are available as
|
bttv.radio= Most important insmod options are available as
|
||||||
kernel args too.
|
kernel args too.
|
||||||
|
@ -518,6 +527,7 @@
|
||||||
Default value is set via a kernel config option.
|
Default value is set via a kernel config option.
|
||||||
Value can be changed at runtime via
|
Value can be changed at runtime via
|
||||||
/sys/fs/selinux/checkreqprot.
|
/sys/fs/selinux/checkreqprot.
|
||||||
|
Setting checkreqprot to 1 is deprecated.
|
||||||
|
|
||||||
cio_ignore= [S390]
|
cio_ignore= [S390]
|
||||||
See Documentation/s390/common_io.rst for details.
|
See Documentation/s390/common_io.rst for details.
|
||||||
|
@ -675,7 +685,7 @@
|
||||||
coredump_filter=
|
coredump_filter=
|
||||||
[KNL] Change the default value for
|
[KNL] Change the default value for
|
||||||
/proc/<pid>/coredump_filter.
|
/proc/<pid>/coredump_filter.
|
||||||
See also Documentation/filesystems/proc.txt.
|
See also Documentation/filesystems/proc.rst.
|
||||||
|
|
||||||
coresight_cpu_debug.enable
|
coresight_cpu_debug.enable
|
||||||
[ARM,ARM64]
|
[ARM,ARM64]
|
||||||
|
@ -952,7 +962,7 @@
|
||||||
edid/1680x1050.bin, or edid/1920x1080.bin is given
|
edid/1680x1050.bin, or edid/1920x1080.bin is given
|
||||||
and no file with the same name exists. Details and
|
and no file with the same name exists. Details and
|
||||||
instructions how to build your own EDID data are
|
instructions how to build your own EDID data are
|
||||||
available in Documentation/driver-api/edid.rst. An EDID
|
available in Documentation/admin-guide/edid.rst. An EDID
|
||||||
data set will only be used for a particular connector,
|
data set will only be used for a particular connector,
|
||||||
if its name and a colon are prepended to the EDID
|
if its name and a colon are prepended to the EDID
|
||||||
name. Each connector may use a unique EDID data
|
name. Each connector may use a unique EDID data
|
||||||
|
@ -982,10 +992,6 @@
|
||||||
Documentation/admin-guide/dynamic-debug-howto.rst
|
Documentation/admin-guide/dynamic-debug-howto.rst
|
||||||
for details.
|
for details.
|
||||||
|
|
||||||
nompx [X86] Disables Intel Memory Protection Extensions.
|
|
||||||
See Documentation/x86/intel_mpx.rst for more
|
|
||||||
information about the feature.
|
|
||||||
|
|
||||||
nopku [X86] Disable Memory Protection Keys CPU feature found
|
nopku [X86] Disable Memory Protection Keys CPU feature found
|
||||||
in some Intel CPUs.
|
in some Intel CPUs.
|
||||||
|
|
||||||
|
@ -1095,6 +1101,12 @@
|
||||||
A valid base address must be provided, and the serial
|
A valid base address must be provided, and the serial
|
||||||
port must already be setup and configured.
|
port must already be setup and configured.
|
||||||
|
|
||||||
|
ec_imx21,<addr>
|
||||||
|
ec_imx6q,<addr>
|
||||||
|
Start an early, polled-mode, output-only console on the
|
||||||
|
Freescale i.MX UART at the specified address. The UART
|
||||||
|
must already be setup and configured.
|
||||||
|
|
||||||
ar3700_uart,<addr>
|
ar3700_uart,<addr>
|
||||||
Start an early, polled-mode console on the
|
Start an early, polled-mode console on the
|
||||||
Armada 3700 serial port at the specified
|
Armada 3700 serial port at the specified
|
||||||
|
@ -1350,6 +1362,24 @@
|
||||||
can be changed at run time by the max_graph_depth file
|
can be changed at run time by the max_graph_depth file
|
||||||
in the tracefs tracing directory. default: 0 (no limit)
|
in the tracefs tracing directory. default: 0 (no limit)
|
||||||
|
|
||||||
|
fw_devlink= [KNL] Create device links between consumer and supplier
|
||||||
|
devices by scanning the firmware to infer the
|
||||||
|
consumer/supplier relationships. This feature is
|
||||||
|
especially useful when drivers are loaded as modules as
|
||||||
|
it ensures proper ordering of tasks like device probing
|
||||||
|
(suppliers first, then consumers), supplier boot state
|
||||||
|
clean up (only after all consumers have probed),
|
||||||
|
suspend/resume & runtime PM (consumers first, then
|
||||||
|
suppliers).
|
||||||
|
Format: { off | permissive | on | rpm }
|
||||||
|
off -- Don't create device links from firmware info.
|
||||||
|
permissive -- Create device links from firmware info
|
||||||
|
but use it only for ordering boot state clean
|
||||||
|
up (sync_state() calls).
|
||||||
|
on -- Create device links from firmware info and use it
|
||||||
|
to enforce probe and suspend/resume ordering.
|
||||||
|
rpm -- Like "on", but also use it to order runtime PM.
|
||||||
|
|
||||||
gamecon.map[2|3]=
|
gamecon.map[2|3]=
|
||||||
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
|
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
|
||||||
support via parallel port (up to 5 devices per port)
|
support via parallel port (up to 5 devices per port)
|
||||||
|
@ -1441,6 +1471,14 @@
|
||||||
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
|
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
|
||||||
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
|
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
|
||||||
|
|
||||||
|
hugetlb_cma= [HW] The size of a cma area used for allocation
|
||||||
|
of gigantic hugepages.
|
||||||
|
Format: nn[KMGTPE]
|
||||||
|
|
||||||
|
Reserve a cma area of given size and allocate gigantic
|
||||||
|
hugepages using the cma allocator. If enabled, the
|
||||||
|
boot-time allocation of gigantic hugepages is skipped.
|
||||||
|
|
||||||
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
|
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
|
||||||
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
|
hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
|
||||||
On x86-64 and powerpc, this option can be specified
|
On x86-64 and powerpc, this option can be specified
|
||||||
|
@ -1775,7 +1813,7 @@
|
||||||
provided by tboot because it makes the system
|
provided by tboot because it makes the system
|
||||||
vulnerable to DMA attacks.
|
vulnerable to DMA attacks.
|
||||||
nobounce [Default off]
|
nobounce [Default off]
|
||||||
Disable bounce buffer for unstrusted devices such as
|
Disable bounce buffer for untrusted devices such as
|
||||||
the Thunderbolt devices. This will treat the untrusted
|
the Thunderbolt devices. This will treat the untrusted
|
||||||
devices as the trusted ones, hence might expose security
|
devices as the trusted ones, hence might expose security
|
||||||
risks of DMA attacks.
|
risks of DMA attacks.
|
||||||
|
@ -1879,7 +1917,7 @@
|
||||||
No delay
|
No delay
|
||||||
|
|
||||||
ip= [IP_PNP]
|
ip= [IP_PNP]
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||||
|
|
||||||
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
||||||
IPC identifiers from 32,768 to 16,777,216.
|
IPC identifiers from 32,768 to 16,777,216.
|
||||||
|
@ -2539,13 +2577,22 @@
|
||||||
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
||||||
|
|
||||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||||
Amount of memory to be used when the kernel is not able
|
Amount of memory to be used in cases as follows:
|
||||||
to see the whole system memory or for test.
|
|
||||||
|
1 for test;
|
||||||
|
2 when the kernel is not able to see the whole system memory;
|
||||||
|
3 memory that lies after 'mem=' boundary is excluded from
|
||||||
|
the hypervisor, then assigned to KVM guests.
|
||||||
|
|
||||||
[X86] Work as limiting max address. Use together
|
[X86] Work as limiting max address. Use together
|
||||||
with memmap= to avoid physical address space collisions.
|
with memmap= to avoid physical address space collisions.
|
||||||
Without memmap= PCI devices could be placed at addresses
|
Without memmap= PCI devices could be placed at addresses
|
||||||
belonging to unused RAM.
|
belonging to unused RAM.
|
||||||
|
|
||||||
|
Note that this only takes effect during boot time since
|
||||||
|
in above case 3, memory may need to be hot added after boot
|
||||||
|
if system memory of hypervisor is not sufficient.
|
||||||
|
|
||||||
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
|
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
|
||||||
memory.
|
memory.
|
||||||
|
|
||||||
|
@ -2791,7 +2838,7 @@
|
||||||
<name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
|
<name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
|
||||||
|
|
||||||
mtdparts= [MTD]
|
mtdparts= [MTD]
|
||||||
See drivers/mtd/cmdlinepart.c.
|
See drivers/mtd/parsers/cmdlinepart.c
|
||||||
|
|
||||||
multitce=off [PPC] This parameter disables the use of the pSeries
|
multitce=off [PPC] This parameter disables the use of the pSeries
|
||||||
firmware feature for updating multiple TCE entries
|
firmware feature for updating multiple TCE entries
|
||||||
|
@ -2849,13 +2896,13 @@
|
||||||
Default value is 0.
|
Default value is 0.
|
||||||
|
|
||||||
nfsaddrs= [NFS] Deprecated. Use ip= instead.
|
nfsaddrs= [NFS] Deprecated. Use ip= instead.
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||||
|
|
||||||
nfsroot= [NFS] nfs root filesystem for disk-less boxes.
|
nfsroot= [NFS] nfs root filesystem for disk-less boxes.
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||||
|
|
||||||
nfsrootdebug [NFS] enable nfsroot debugging messages.
|
nfsrootdebug [NFS] enable nfsroot debugging messages.
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/admin-guide/nfs/nfsroot.rst.
|
||||||
|
|
||||||
nfs.callback_nr_threads=
|
nfs.callback_nr_threads=
|
||||||
[NFSv4] set the total number of threads that the
|
[NFSv4] set the total number of threads that the
|
||||||
|
@ -3170,7 +3217,7 @@
|
||||||
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
||||||
clock and use the default one.
|
clock and use the default one.
|
||||||
|
|
||||||
no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time
|
no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time
|
||||||
accounting. steal time is computed, but won't
|
accounting. steal time is computed, but won't
|
||||||
influence scheduler behaviour
|
influence scheduler behaviour
|
||||||
|
|
||||||
|
@ -3281,12 +3328,6 @@
|
||||||
This can be set from sysctl after boot.
|
This can be set from sysctl after boot.
|
||||||
See Documentation/admin-guide/sysctl/vm.rst for details.
|
See Documentation/admin-guide/sysctl/vm.rst for details.
|
||||||
|
|
||||||
of_devlink [OF, KNL] Create device links between consumer and
|
|
||||||
supplier devices by scanning the devictree to infer the
|
|
||||||
consumer/supplier relationships. A consumer device
|
|
||||||
will not be probed until all the supplier devices have
|
|
||||||
probed successfully.
|
|
||||||
|
|
||||||
ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
|
ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
|
||||||
See Documentation/debugging-via-ohci1394.txt for more
|
See Documentation/debugging-via-ohci1394.txt for more
|
||||||
info.
|
info.
|
||||||
|
@ -3694,6 +3735,9 @@
|
||||||
Override pmtimer IOPort with a hex value.
|
Override pmtimer IOPort with a hex value.
|
||||||
e.g. pmtmr=0x508
|
e.g. pmtmr=0x508
|
||||||
|
|
||||||
|
pm_debug_messages [SUSPEND,KNL]
|
||||||
|
Enable suspend/resume debug messages during boot up.
|
||||||
|
|
||||||
pnp.debug=1 [PNP]
|
pnp.debug=1 [PNP]
|
||||||
Enable PNP debug messages (depends on the
|
Enable PNP debug messages (depends on the
|
||||||
CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time
|
CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time
|
||||||
|
@ -3795,6 +3839,11 @@
|
||||||
before loading.
|
before loading.
|
||||||
See Documentation/admin-guide/blockdev/ramdisk.rst.
|
See Documentation/admin-guide/blockdev/ramdisk.rst.
|
||||||
|
|
||||||
|
prot_virt= [S390] enable hosting protected virtual machines
|
||||||
|
isolated from the hypervisor (if hardware supports
|
||||||
|
that).
|
||||||
|
Format: <bool>
|
||||||
|
|
||||||
psi= [KNL] Enable or disable pressure stall information
|
psi= [KNL] Enable or disable pressure stall information
|
||||||
tracking.
|
tracking.
|
||||||
Format: <bool>
|
Format: <bool>
|
||||||
|
@ -3980,6 +4029,15 @@
|
||||||
Set threshold of queued RCU callbacks below which
|
Set threshold of queued RCU callbacks below which
|
||||||
batch limiting is re-enabled.
|
batch limiting is re-enabled.
|
||||||
|
|
||||||
|
rcutree.qovld= [KNL]
|
||||||
|
Set threshold of queued RCU callbacks beyond which
|
||||||
|
RCU's force-quiescent-state scan will aggressively
|
||||||
|
enlist help from cond_resched() and sched IPIs to
|
||||||
|
help CPUs more quickly reach quiescent states.
|
||||||
|
Set to less than zero to make this be set based
|
||||||
|
on rcutree.qhimark at boot time and to zero to
|
||||||
|
disable more aggressive help enlistment.
|
||||||
|
|
||||||
rcutree.rcu_idle_gp_delay= [KNL]
|
rcutree.rcu_idle_gp_delay= [KNL]
|
||||||
Set wakeup interval for idle CPUs that have
|
Set wakeup interval for idle CPUs that have
|
||||||
RCU callbacks (RCU_FAST_NO_HZ=y).
|
RCU callbacks (RCU_FAST_NO_HZ=y).
|
||||||
|
@ -4195,6 +4253,12 @@
|
||||||
rcupdate.rcu_cpu_stall_suppress= [KNL]
|
rcupdate.rcu_cpu_stall_suppress= [KNL]
|
||||||
Suppress RCU CPU stall warning messages.
|
Suppress RCU CPU stall warning messages.
|
||||||
|
|
||||||
|
rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL]
|
||||||
|
Suppress RCU CPU stall warning messages and
|
||||||
|
rcutorture writer stall warnings that occur
|
||||||
|
during early boot, that is, during the time
|
||||||
|
before the init task is spawned.
|
||||||
|
|
||||||
rcupdate.rcu_cpu_stall_timeout= [KNL]
|
rcupdate.rcu_cpu_stall_timeout= [KNL]
|
||||||
Set timeout for RCU CPU stall warning messages.
|
Set timeout for RCU CPU stall warning messages.
|
||||||
|
|
||||||
|
@ -4388,6 +4452,22 @@
|
||||||
incurs a small amount of overhead in the scheduler
|
incurs a small amount of overhead in the scheduler
|
||||||
but is useful for debugging and performance tuning.
|
but is useful for debugging and performance tuning.
|
||||||
|
|
||||||
|
sched_thermal_decay_shift=
|
||||||
|
[KNL, SMP] Set a decay shift for scheduler thermal
|
||||||
|
pressure signal. Thermal pressure signal follows the
|
||||||
|
default decay period of other scheduler pelt
|
||||||
|
signals (usually 32 ms but configurable). Setting
|
||||||
|
sched_thermal_decay_shift will left shift the decay
|
||||||
|
period for the thermal pressure signal by the shift
|
||||||
|
value.
|
||||||
|
i.e. with the default pelt decay period of 32 ms
|
||||||
|
sched_thermal_decay_shift thermal pressure decay pr
|
||||||
|
1 64 ms
|
||||||
|
2 128 ms
|
||||||
|
and so on.
|
||||||
|
Format: integer between 0 and 10
|
||||||
|
Default is 0.
|
||||||
|
|
||||||
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
|
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
|
||||||
xtime_lock contention on larger systems, and/or RCU lock
|
xtime_lock contention on larger systems, and/or RCU lock
|
||||||
contention on all systems with CONFIG_MAXSMP set.
|
contention on all systems with CONFIG_MAXSMP set.
|
||||||
|
@ -4510,10 +4590,10 @@
|
||||||
Format: <integer>
|
Format: <integer>
|
||||||
|
|
||||||
A nonzero value instructs the soft-lockup detector
|
A nonzero value instructs the soft-lockup detector
|
||||||
to panic the machine when a soft-lockup occurs. This
|
to panic the machine when a soft-lockup occurs. It is
|
||||||
is also controlled by CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC
|
also controlled by the kernel.softlockup_panic sysctl
|
||||||
which is the respective build-time switch to that
|
and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
|
||||||
functionality.
|
respective build-time switch to that functionality.
|
||||||
|
|
||||||
softlockup_all_cpu_backtrace=
|
softlockup_all_cpu_backtrace=
|
||||||
[KNL] Should the soft-lockup detector generate
|
[KNL] Should the soft-lockup detector generate
|
||||||
|
@ -4655,6 +4735,28 @@
|
||||||
spia_pedr=
|
spia_pedr=
|
||||||
spia_peddr=
|
spia_peddr=
|
||||||
|
|
||||||
|
split_lock_detect=
|
||||||
|
[X86] Enable split lock detection
|
||||||
|
|
||||||
|
When enabled (and if hardware support is present), atomic
|
||||||
|
instructions that access data across cache line
|
||||||
|
boundaries will result in an alignment check exception.
|
||||||
|
|
||||||
|
off - not enabled
|
||||||
|
|
||||||
|
warn - the kernel will emit rate limited warnings
|
||||||
|
about applications triggering the #AC
|
||||||
|
exception. This mode is the default on CPUs
|
||||||
|
that support split lock detection.
|
||||||
|
|
||||||
|
fatal - the kernel will send SIGBUS to applications
|
||||||
|
that trigger the #AC exception.
|
||||||
|
|
||||||
|
If an #AC exception is hit in the kernel or in
|
||||||
|
firmware (i.e. not while executing in user mode)
|
||||||
|
the kernel will oops in either "warn" or "fatal"
|
||||||
|
mode.
|
||||||
|
|
||||||
srcutree.counter_wrap_check [KNL]
|
srcutree.counter_wrap_check [KNL]
|
||||||
Specifies how frequently to check for
|
Specifies how frequently to check for
|
||||||
grace-period sequence counter wrap for the
|
grace-period sequence counter wrap for the
|
||||||
|
@ -4867,6 +4969,10 @@
|
||||||
topology updates sent by the hypervisor to this
|
topology updates sent by the hypervisor to this
|
||||||
LPAR.
|
LPAR.
|
||||||
|
|
||||||
|
torture.disable_onoff_at_boot= [KNL]
|
||||||
|
Prevent the CPU-hotplug component of torturing
|
||||||
|
until after init has spawned.
|
||||||
|
|
||||||
tp720= [HW,PS2]
|
tp720= [HW,PS2]
|
||||||
|
|
||||||
tpm_suspend_pcr=[HW,TPM]
|
tpm_suspend_pcr=[HW,TPM]
|
||||||
|
|
|
@ -234,7 +234,7 @@ To reduce its OS jitter, do any of the following:
|
||||||
Such a workqueue can be confined to a given subset of the
|
Such a workqueue can be confined to a given subset of the
|
||||||
CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
|
CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
|
||||||
files. The set of WQ_SYSFS workqueues can be displayed using
|
files. The set of WQ_SYSFS workqueues can be displayed using
|
||||||
"ls sys/devices/virtual/workqueue". That said, the workqueues
|
"ls /sys/devices/virtual/workqueue". That said, the workqueues
|
||||||
maintainer would like to caution people against indiscriminately
|
maintainer would like to caution people against indiscriminately
|
||||||
sprinkling WQ_SYSFS across all the workqueues. The reason for
|
sprinkling WQ_SYSFS across all the workqueues. The reason for
|
||||||
caution is that it is easy to add WQ_SYSFS, but because sysfs is
|
caution is that it is easy to add WQ_SYSFS, but because sysfs is
|
||||||
|
|
|
@ -310,6 +310,11 @@ thp_fault_fallback
|
||||||
is incremented if a page fault fails to allocate
|
is incremented if a page fault fails to allocate
|
||||||
a huge page and instead falls back to using small pages.
|
a huge page and instead falls back to using small pages.
|
||||||
|
|
||||||
|
thp_fault_fallback_charge
|
||||||
|
is incremented if a page fault fails to charge a huge page and
|
||||||
|
instead falls back to using small pages even though the
|
||||||
|
allocation was successful.
|
||||||
|
|
||||||
thp_collapse_alloc_failed
|
thp_collapse_alloc_failed
|
||||||
is incremented if khugepaged found a range
|
is incremented if khugepaged found a range
|
||||||
of pages that should be collapsed into one huge page but failed
|
of pages that should be collapsed into one huge page but failed
|
||||||
|
@ -319,6 +324,15 @@ thp_file_alloc
|
||||||
is incremented every time a file huge page is successfully
|
is incremented every time a file huge page is successfully
|
||||||
allocated.
|
allocated.
|
||||||
|
|
||||||
|
thp_file_fallback
|
||||||
|
is incremented if a file huge page is attempted to be allocated
|
||||||
|
but fails and instead falls back to using small pages.
|
||||||
|
|
||||||
|
thp_file_fallback_charge
|
||||||
|
is incremented if a file huge page cannot be charged and instead
|
||||||
|
falls back to using small pages even though the allocation was
|
||||||
|
successful.
|
||||||
|
|
||||||
thp_file_mapped
|
thp_file_mapped
|
||||||
is incremented every time a file huge page is mapped into
|
is incremented every time a file huge page is mapped into
|
||||||
user address space.
|
user address space.
|
||||||
|
|
|
@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
|
||||||
half copied page since it'll keep userfaulting until the copy has
|
half copied page since it'll keep userfaulting until the copy has
|
||||||
finished.
|
finished.
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then
|
||||||
|
you must provide some kind of page in your thread after reading from
|
||||||
|
the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
|
||||||
|
The normal behavior of the OS automatically providing a zero page on
|
||||||
|
an anonymous mapping is not in place.
|
||||||
|
|
||||||
|
- None of the page-delivering ioctls default to the range that you
|
||||||
|
registered with. You must fill in all fields for the appropriate
|
||||||
|
ioctl struct including the range.
|
||||||
|
|
||||||
|
- You get the address of the access that triggered the missing page
|
||||||
|
event out of a struct uffd_msg that you read in the thread from the
|
||||||
|
uffd. You can supply as many pages as you want with UFFDIO_COPY or
|
||||||
|
UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then
|
||||||
|
the first of any of those IOCTLs wakes up the faulting thread.
|
||||||
|
|
||||||
|
- Be sure to test for all errors including (pollfd[0].revents &
|
||||||
|
POLLERR). This can happen, e.g. when ranges supplied were
|
||||||
|
incorrect.
|
||||||
|
|
||||||
|
Write Protect Notifications
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
This is equivalent to (but faster than) using mprotect and a SIGSEGV
|
||||||
|
signal handler.
|
||||||
|
|
||||||
|
Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP.
|
||||||
|
Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT,
|
||||||
|
struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP
|
||||||
|
in the struct passed in. The range does not default to and does not
|
||||||
|
have to be identical to the range you registered with. You can write
|
||||||
|
protect as many ranges as you like (inside the registered range).
|
||||||
|
Then, in the thread reading from uffd the struct will have
|
||||||
|
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
|
||||||
|
ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
|
||||||
|
while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
|
||||||
|
This wakes up the thread which will continue to run with writes. This
|
||||||
|
allows you to do the bookkeeping about the write in the uffd reading
|
||||||
|
thread before the ioctl.
|
||||||
|
|
||||||
|
If you registered with both UFFDIO_REGISTER_MODE_MISSING and
|
||||||
|
UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in
|
||||||
|
which you supply a page and undo write protect. Note that there is a
|
||||||
|
difference between writes into a WP area and into a !WP area. The
|
||||||
|
former will have UFFD_PAGEFAULT_FLAG_WP set, the latter
|
||||||
|
UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but
|
||||||
|
you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was
|
||||||
|
used.
|
||||||
|
|
||||||
QEMU/KVM
|
QEMU/KVM
|
||||||
========
|
========
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,8 @@ value 1 for supported.
|
||||||
|
|
||||||
AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter.
|
AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter.
|
||||||
When non-masked bits are matching corresponding AXI_ID bits then counter is
|
When non-masked bits are matching corresponding AXI_ID bits then counter is
|
||||||
incremented. Perf counter is incremented if
|
incremented. Perf counter is incremented if::
|
||||||
|
|
||||||
AxID && AXI_MASKING == AXI_ID && AXI_MASKING
|
AxID && AXI_MASKING == AXI_ID && AXI_MASKING
|
||||||
|
|
||||||
This filter doesn't support filter different AXI ID for axid-read and axid-write
|
This filter doesn't support filter different AXI ID for axid-read and axid-write
|
||||||
|
|
274
Documentation/admin-guide/pm/cpufreq_drivers.rst
Normal file
274
Documentation/admin-guide/pm/cpufreq_drivers.rst
Normal file
|
@ -0,0 +1,274 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=======================================================
|
||||||
|
Legacy Documentation of CPU Performance Scaling Drivers
|
||||||
|
=======================================================
|
||||||
|
|
||||||
|
Included below are historic documents describing assorted
|
||||||
|
:doc:`CPU performance scaling <cpufreq>` drivers. They are reproduced verbatim,
|
||||||
|
with the original white space formatting and indentation preserved, except for
|
||||||
|
the added leading space character in every line of text.
|
||||||
|
|
||||||
|
|
||||||
|
AMD PowerNow! Drivers
|
||||||
|
=====================
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
PowerNow! and Cool'n'Quiet are AMD names for frequency
|
||||||
|
management capabilities in AMD processors. As the hardware
|
||||||
|
implementation changes in new generations of the processors,
|
||||||
|
there is a different cpu-freq driver for each generation.
|
||||||
|
|
||||||
|
Note that the drivers will not load on the "wrong" hardware,
|
||||||
|
so it is safe to try each driver in turn when in doubt as to
|
||||||
|
which is the correct driver.
|
||||||
|
|
||||||
|
Note that the functionality to change frequency (and voltage)
|
||||||
|
is not available in all processors. The drivers will refuse
|
||||||
|
to load on processors without this capability. The capability
|
||||||
|
is detected with the cpuid instruction.
|
||||||
|
|
||||||
|
The drivers use BIOS supplied tables to obtain frequency and
|
||||||
|
voltage information appropriate for a particular platform.
|
||||||
|
Frequency transitions will be unavailable if the BIOS does
|
||||||
|
not supply these tables.
|
||||||
|
|
||||||
|
6th Generation: powernow-k6
|
||||||
|
|
||||||
|
7th Generation: powernow-k7: Athlon, Duron, Geode.
|
||||||
|
|
||||||
|
8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron.
|
||||||
|
Documentation on this functionality in 8th generation processors
|
||||||
|
is available in the "BIOS and Kernel Developer's Guide", publication
|
||||||
|
26094, in chapter 9, available for download from www.amd.com.
|
||||||
|
|
||||||
|
BIOS supplied data, for powernow-k7 and for powernow-k8, may be
|
||||||
|
from either the PSB table or from ACPI objects. The ACPI support
|
||||||
|
is only available if the kernel config sets CONFIG_ACPI_PROCESSOR.
|
||||||
|
The powernow-k8 driver will attempt to use ACPI if so configured,
|
||||||
|
and fall back to PST if that fails.
|
||||||
|
The powernow-k7 driver will try to use the PSB support first, and
|
||||||
|
fall back to ACPI if the PSB support fails. A module parameter,
|
||||||
|
acpi_force, is provided to force ACPI support to be used instead
|
||||||
|
of PSB support.
|
||||||
|
|
||||||
|
|
||||||
|
``cpufreq-nforce2``
|
||||||
|
===================
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms.
|
||||||
|
|
||||||
|
This works better than on other platforms, because the FSB of the CPU
|
||||||
|
can be controlled independently from the PCI/AGP clock.
|
||||||
|
|
||||||
|
The module has two options:
|
||||||
|
|
||||||
|
fid: multiplier * 10 (for example 8.5 = 85)
|
||||||
|
min_fsb: minimum FSB
|
||||||
|
|
||||||
|
If not set, fid is calculated from the current CPU speed and the FSB.
|
||||||
|
min_fsb defaults to FSB at boot time - 50 MHz.
|
||||||
|
|
||||||
|
IMPORTANT: The available range is limited downwards!
|
||||||
|
Also the minimum available FSB can differ, for systems
|
||||||
|
booting with 200 MHz, 150 should always work.
|
||||||
|
|
||||||
|
|
||||||
|
``pcc-cpufreq``
|
||||||
|
===============
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pcc-cpufreq.txt - PCC interface documentation
|
||||||
|
*
|
||||||
|
* Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
|
||||||
|
* Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
|
||||||
|
* Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
Processor Clocking Control Driver
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
---------
|
||||||
|
1. Introduction
|
||||||
|
1.1 PCC interface
|
||||||
|
1.1.1 Get Average Frequency
|
||||||
|
1.1.2 Set Desired Frequency
|
||||||
|
1.2 Platforms affected
|
||||||
|
2. Driver and /sys details
|
||||||
|
2.1 scaling_available_frequencies
|
||||||
|
2.2 cpuinfo_transition_latency
|
||||||
|
2.3 cpuinfo_cur_freq
|
||||||
|
2.4 related_cpus
|
||||||
|
3. Caveats
|
||||||
|
|
||||||
|
1. Introduction:
|
||||||
|
----------------
|
||||||
|
Processor Clocking Control (PCC) is an interface between the platform
|
||||||
|
firmware and OSPM. It is a mechanism for coordinating processor
|
||||||
|
performance (ie: frequency) between the platform firmware and the OS.
|
||||||
|
|
||||||
|
The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
|
||||||
|
interface.
|
||||||
|
|
||||||
|
OS utilizes the PCC interface to inform platform firmware what frequency the
|
||||||
|
OS wants for a logical processor. The platform firmware attempts to achieve
|
||||||
|
the requested frequency. If the request for the target frequency could not be
|
||||||
|
satisfied by platform firmware, then it usually means that power budget
|
||||||
|
conditions are in place, and "power capping" is taking place.
|
||||||
|
|
||||||
|
1.1 PCC interface:
|
||||||
|
------------------
|
||||||
|
The complete PCC specification is available here:
|
||||||
|
https://acpica.org/sites/acpica/files/Processor-Clocking-Control-v1p0.pdf
|
||||||
|
|
||||||
|
PCC relies on a shared memory region that provides a channel for communication
|
||||||
|
between the OS and platform firmware. PCC also implements a "doorbell" that
|
||||||
|
is used by the OS to inform the platform firmware that a command has been
|
||||||
|
sent.
|
||||||
|
|
||||||
|
The ACPI PCCH() method is used to discover the location of the PCC shared
|
||||||
|
memory region. The shared memory region header contains the "command" and
|
||||||
|
"status" interface. PCCH() also contains details on how to access the platform
|
||||||
|
doorbell.
|
||||||
|
|
||||||
|
The following commands are supported by the PCC interface:
|
||||||
|
* Get Average Frequency
|
||||||
|
* Set Desired Frequency
|
||||||
|
|
||||||
|
The ACPI PCCP() method is implemented for each logical processor and is
|
||||||
|
used to discover the offsets for the input and output buffers in the shared
|
||||||
|
memory region.
|
||||||
|
|
||||||
|
When PCC mode is enabled, the platform will not expose processor performance
|
||||||
|
or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
|
||||||
|
the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
|
||||||
|
AMD) will not load.
|
||||||
|
|
||||||
|
However, OSPM remains in control of policy. The governor (eg: "ondemand")
|
||||||
|
computes the required performance for each processor based on server workload.
|
||||||
|
The PCC driver fills in the command interface, and the input buffer and
|
||||||
|
communicates the request to the platform firmware. The platform firmware is
|
||||||
|
responsible for delivering the requested performance.
|
||||||
|
|
||||||
|
Each PCC command is "global" in scope and can affect all the logical CPUs in
|
||||||
|
the system. Therefore, PCC is capable of performing "group" updates. With PCC
|
||||||
|
the OS is capable of getting/setting the frequency of all the logical CPUs in
|
||||||
|
the system with a single call to the BIOS.
|
||||||
|
|
||||||
|
1.1.1 Get Average Frequency:
|
||||||
|
----------------------------
|
||||||
|
This command is used by the OSPM to query the running frequency of the
|
||||||
|
processor since the last time this command was completed. The output buffer
|
||||||
|
indicates the average unhalted frequency of the logical processor expressed as
|
||||||
|
a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
|
||||||
|
also signifies if the CPU frequency is limited by a power budget condition.
|
||||||
|
|
||||||
|
1.1.2 Set Desired Frequency:
|
||||||
|
----------------------------
|
||||||
|
This command is used by the OSPM to communicate to the platform firmware the
|
||||||
|
desired frequency for a logical processor. The output buffer is currently
|
||||||
|
ignored by OSPM. The next invocation of "Get Average Frequency" will inform
|
||||||
|
OSPM if the desired frequency was achieved or not.
|
||||||
|
|
||||||
|
1.2 Platforms affected:
|
||||||
|
-----------------------
|
||||||
|
The PCC driver will load on any system where the platform firmware:
|
||||||
|
* supports the PCC interface, and the associated PCCH() and PCCP() methods
|
||||||
|
* assumes responsibility for managing the hardware clocking controls in order
|
||||||
|
to deliver the requested processor performance
|
||||||
|
|
||||||
|
Currently, certain HP ProLiant platforms implement the PCC interface. On those
|
||||||
|
platforms PCC is the "default" choice.
|
||||||
|
|
||||||
|
However, it is possible to disable this interface via a BIOS setting. In
|
||||||
|
such an instance, as is also the case on platforms where the PCC interface
|
||||||
|
is not implemented, the PCC driver will fail to load silently.
|
||||||
|
|
||||||
|
2. Driver and /sys details:
|
||||||
|
---------------------------
|
||||||
|
When the driver loads, it merely prints the lowest and the highest CPU
|
||||||
|
frequencies supported by the platform firmware.
|
||||||
|
|
||||||
|
The PCC driver loads with a message such as:
|
||||||
|
pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
|
||||||
|
MHz
|
||||||
|
|
||||||
|
This means that the OSPM can request the CPU to run at any frequency in
|
||||||
|
between the limits (1600 MHz, and 2933 MHz) specified in the message.
|
||||||
|
|
||||||
|
Internally, there is no need for the driver to convert the "target" frequency
|
||||||
|
to a corresponding P-state.
|
||||||
|
|
||||||
|
The VERSION number for the driver will be of the format v.xy.ab.
|
||||||
|
eg: 1.00.02
|
||||||
|
----- --
|
||||||
|
| |
|
||||||
|
| -- this will increase with bug fixes/enhancements to the driver
|
||||||
|
|-- this is the version of the PCC specification the driver adheres to
|
||||||
|
|
||||||
|
|
||||||
|
The following is a brief discussion on some of the fields exported via the
|
||||||
|
/sys filesystem and how their values are affected by the PCC driver:
|
||||||
|
|
||||||
|
2.1 scaling_available_frequencies:
|
||||||
|
----------------------------------
|
||||||
|
scaling_available_frequencies is not created in /sys. No intermediate
|
||||||
|
frequencies need to be listed because the BIOS will try to achieve any
|
||||||
|
frequency, within limits, requested by the governor. A frequency does not have
|
||||||
|
to be strictly associated with a P-state.
|
||||||
|
|
||||||
|
2.2 cpuinfo_transition_latency:
|
||||||
|
-------------------------------
|
||||||
|
The cpuinfo_transition_latency field is 0. The PCC specification does
|
||||||
|
not include a field to expose this value currently.
|
||||||
|
|
||||||
|
2.3 cpuinfo_cur_freq:
|
||||||
|
---------------------
|
||||||
|
A) Often cpuinfo_cur_freq will show a value different than what is declared
|
||||||
|
in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
|
||||||
|
This is due to "turbo boost" available on recent Intel processors. If certain
|
||||||
|
conditions are met the BIOS can achieve a slightly higher speed than requested
|
||||||
|
by OSPM. An example:
|
||||||
|
|
||||||
|
scaling_cur_freq : 2933000
|
||||||
|
cpuinfo_cur_freq : 3196000
|
||||||
|
|
||||||
|
B) There is a round-off error associated with the cpuinfo_cur_freq value.
|
||||||
|
Since the driver obtains the current frequency as a "percentage" (%) of the
|
||||||
|
nominal frequency from the BIOS, sometimes, the values displayed by
|
||||||
|
scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
|
||||||
|
|
||||||
|
scaling_cur_freq : 1600000
|
||||||
|
cpuinfo_cur_freq : 1583000
|
||||||
|
|
||||||
|
In this example, the nominal frequency is 2933 MHz. The driver obtains the
|
||||||
|
current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
|
||||||
|
|
||||||
|
54% of 2933 MHz = 1583 MHz
|
||||||
|
|
||||||
|
Nominal frequency is the maximum frequency of the processor, and it usually
|
||||||
|
corresponds to the frequency of the P0 P-state.
|
||||||
|
|
||||||
|
2.4 related_cpus:
|
||||||
|
-----------------
|
||||||
|
The related_cpus field is identical to affected_cpus.
|
||||||
|
|
||||||
|
affected_cpus : 4
|
||||||
|
related_cpus : 4
|
||||||
|
|
||||||
|
Currently, the PCC driver does not evaluate _PSD. The platforms that support
|
||||||
|
PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
|
||||||
|
to ensure that the same frequency is requested of all dependent CPUs.
|
||||||
|
|
||||||
|
3. Caveats:
|
||||||
|
-----------
|
||||||
|
The "cpufreq_stats" module in its present form cannot be loaded and
|
||||||
|
expected to work with the PCC driver. Since the "cpufreq_stats" module
|
||||||
|
provides information wrt each P-state, it is not applicable to the PCC driver.
|
|
@ -583,20 +583,17 @@ Power Management Quality of Service for CPUs
|
||||||
The power management quality of service (PM QoS) framework in the Linux kernel
|
The power management quality of service (PM QoS) framework in the Linux kernel
|
||||||
allows kernel code and user space processes to set constraints on various
|
allows kernel code and user space processes to set constraints on various
|
||||||
energy-efficiency features of the kernel to prevent performance from dropping
|
energy-efficiency features of the kernel to prevent performance from dropping
|
||||||
below a required level. The PM QoS constraints can be set globally, in
|
below a required level.
|
||||||
predefined categories referred to as PM QoS classes, or against individual
|
|
||||||
devices.
|
|
||||||
|
|
||||||
CPU idle time management can be affected by PM QoS in two ways, through the
|
CPU idle time management can be affected by PM QoS in two ways, through the
|
||||||
global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
|
global CPU latency limit and through the resume latency constraints for
|
||||||
resume latency constraints for individual CPUs. Kernel code (e.g. device
|
individual CPUs. Kernel code (e.g. device drivers) can set both of them with
|
||||||
drivers) can set both of them with the help of special internal interfaces
|
the help of special internal interfaces provided by the PM QoS framework. User
|
||||||
provided by the PM QoS framework. User space can modify the former by opening
|
space can modify the former by opening the :file:`cpu_dma_latency` special
|
||||||
the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
|
device file under :file:`/dev/` and writing a binary value (interpreted as a
|
||||||
a binary value (interpreted as a signed 32-bit integer) to it. In turn, the
|
signed 32-bit integer) to it. In turn, the resume latency constraint for a CPU
|
||||||
resume latency constraint for a CPU can be modified by user space by writing a
|
can be modified from user space by writing a string (representing a signed
|
||||||
string (representing a signed 32-bit integer) to the
|
32-bit integer) to the :file:`power/pm_qos_resume_latency_us` file under
|
||||||
:file:`power/pm_qos_resume_latency_us` file under
|
|
||||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
|
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
|
||||||
``<N>`` is allocated at the system initialization time. Negative values
|
``<N>`` is allocated at the system initialization time. Negative values
|
||||||
will be rejected in both cases and, also in both cases, the written integer
|
will be rejected in both cases and, also in both cases, the written integer
|
||||||
|
@ -605,32 +602,34 @@ number will be interpreted as a requested PM QoS constraint in microseconds.
|
||||||
The requested value is not automatically applied as a new constraint, however,
|
The requested value is not automatically applied as a new constraint, however,
|
||||||
as it may be less restrictive (greater in this particular case) than another
|
as it may be less restrictive (greater in this particular case) than another
|
||||||
constraint previously requested by someone else. For this reason, the PM QoS
|
constraint previously requested by someone else. For this reason, the PM QoS
|
||||||
framework maintains a list of requests that have been made so far in each
|
framework maintains a list of requests that have been made so far for the
|
||||||
global class and for each device, aggregates them and applies the effective
|
global CPU latency limit and for each individual CPU, aggregates them and
|
||||||
(minimum in this particular case) value as the new constraint.
|
applies the effective (minimum in this particular case) value as the new
|
||||||
|
constraint.
|
||||||
|
|
||||||
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
|
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
|
||||||
PM QoS request to be created and added to the priority list of requests in the
|
PM QoS request to be created and added to a global priority list of CPU latency
|
||||||
``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
|
limit requests and the file descriptor coming from the "open" operation
|
||||||
"open" operation represents that request. If that file descriptor is then
|
represents that request. If that file descriptor is then used for writing, the
|
||||||
used for writing, the number written to it will be associated with the PM QoS
|
number written to it will be associated with the PM QoS request represented by
|
||||||
request represented by it as a new requested constraint value. Next, the
|
it as a new requested limit value. Next, the priority list mechanism will be
|
||||||
priority list mechanism will be used to determine the new effective value of
|
used to determine the new effective value of the entire list of requests and
|
||||||
the entire list of requests and that effective value will be set as a new
|
that effective value will be set as a new CPU latency limit. Thus requesting a
|
||||||
constraint. Thus setting a new requested constraint value will only change the
|
new limit value will only change the real limit if the effective "list" value is
|
||||||
real constraint if the effective "list" value is affected by it. In particular,
|
affected by it, which is the case if it is the minimum of the requested values
|
||||||
for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
|
in the list.
|
||||||
it is the minimum of the requested constraints in the list. The process holding
|
|
||||||
a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
|
The process holding a file descriptor obtained by opening the
|
||||||
file controls the PM QoS request associated with that file descriptor, but it
|
:file:`cpu_dma_latency` special device file controls the PM QoS request
|
||||||
controls this particular PM QoS request only.
|
associated with that file descriptor, but it controls this particular PM QoS
|
||||||
|
request only.
|
||||||
|
|
||||||
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
|
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
|
||||||
file descriptor obtained while opening it, causes the PM QoS request associated
|
file descriptor obtained while opening it, causes the PM QoS request associated
|
||||||
with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
|
with that file descriptor to be removed from the global priority list of CPU
|
||||||
class priority list and destroyed. If that happens, the priority list mechanism
|
latency limit requests and destroyed. If that happens, the priority list
|
||||||
will be used, again, to determine the new effective value for the whole list
|
mechanism will be used again to determine the new effective value for the whole
|
||||||
and that value will become the new real constraint.
|
list and that value will become the new limit.
|
||||||
|
|
||||||
In turn, for each CPU there is one resume latency PM QoS request associated with
|
In turn, for each CPU there is one resume latency PM QoS request associated with
|
||||||
the :file:`power/pm_qos_resume_latency_us` file under
|
the :file:`power/pm_qos_resume_latency_us` file under
|
||||||
|
@ -647,10 +646,10 @@ CPU in question every time the list of requests is updated this way or another
|
||||||
(there may be other requests coming from kernel code in that list).
|
(there may be other requests coming from kernel code in that list).
|
||||||
|
|
||||||
CPU idle time governors are expected to regard the minimum of the global
|
CPU idle time governors are expected to regard the minimum of the global
|
||||||
effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
|
(effective) CPU latency limit and the effective resume latency constraint for
|
||||||
resume latency constraint for the given CPU as the upper limit for the exit
|
the given CPU as the upper limit for the exit latency of the idle states that
|
||||||
latency of the idle states they can select for that CPU. They should never
|
they are allowed to select for that CPU. They should never select any idle
|
||||||
select any idle states with exit latency beyond that limit.
|
states with exit latency beyond that limit.
|
||||||
|
|
||||||
|
|
||||||
Idle States Control Via Kernel Command Line
|
Idle States Control Via Kernel Command Line
|
||||||
|
|
|
@ -734,10 +734,10 @@ References
|
||||||
==========
|
==========
|
||||||
|
|
||||||
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
||||||
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
https://events.static.linuxfound.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||||
|
|
||||||
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
||||||
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||||
|
|
||||||
.. [3] *Advanced Configuration and Power Interface Specification*,
|
.. [3] *Advanced Configuration and Power Interface Specification*,
|
||||||
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
||||||
|
|
270
Documentation/admin-guide/pm/suspend-flows.rst
Normal file
270
Documentation/admin-guide/pm/suspend-flows.rst
Normal file
|
@ -0,0 +1,270 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
|
=========================
|
||||||
|
System Suspend Code Flows
|
||||||
|
=========================
|
||||||
|
|
||||||
|
:Copyright: |copy| 2020 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
At least one global system-wide transition needs to be carried out for the
|
||||||
|
system to get from the working state into one of the supported
|
||||||
|
:doc:`sleep states <sleep-states>`. Hibernation requires more than one
|
||||||
|
transition to occur for this purpose, but the other sleep states, commonly
|
||||||
|
referred to as *system-wide suspend* (or simply *system suspend*) states, need
|
||||||
|
only one.
|
||||||
|
|
||||||
|
For those sleep states, the transition from the working state of the system into
|
||||||
|
the target sleep state is referred to as *system suspend* too (in the majority
|
||||||
|
of cases, whether this means a transition or a sleep state of the system should
|
||||||
|
be clear from the context) and the transition back from the sleep state into the
|
||||||
|
working state is referred to as *system resume*.
|
||||||
|
|
||||||
|
The kernel code flows associated with the suspend and resume transitions for
|
||||||
|
different sleep states of the system are quite similar, but there are some
|
||||||
|
significant differences between the :ref:`suspend-to-idle <s2idle>` code flows
|
||||||
|
and the code flows related to the :ref:`suspend-to-RAM <s2ram>` and
|
||||||
|
:ref:`standby <standby>` sleep states.
|
||||||
|
|
||||||
|
The :ref:`suspend-to-RAM <s2ram>` and :ref:`standby <standby>` sleep states
|
||||||
|
cannot be implemented without platform support and the difference between them
|
||||||
|
boils down to the platform-specific actions carried out by the suspend and
|
||||||
|
resume hooks that need to be provided by the platform driver to make them
|
||||||
|
available. Apart from that, the suspend and resume code flows for these sleep
|
||||||
|
states are mostly identical, so they both together will be referred to as
|
||||||
|
*platform-dependent suspend* states in what follows.
|
||||||
|
|
||||||
|
|
||||||
|
.. _s2idle_suspend:
|
||||||
|
|
||||||
|
Suspend-to-idle Suspend Code Flow
|
||||||
|
=================================
|
||||||
|
|
||||||
|
The following steps are taken in order to transition the system from the working
|
||||||
|
state to the :ref:`suspend-to-idle <s2idle>` sleep state:
|
||||||
|
|
||||||
|
1. Invoking system-wide suspend notifiers.
|
||||||
|
|
||||||
|
Kernel subsystems can register callbacks to be invoked when the suspend
|
||||||
|
transition is about to occur and when the resume transition has finished.
|
||||||
|
|
||||||
|
That allows them to prepare for the change of the system state and to clean
|
||||||
|
up after getting back to the working state.
|
||||||
|
|
||||||
|
2. Freezing tasks.
|
||||||
|
|
||||||
|
Tasks are frozen primarily in order to avoid unchecked hardware accesses
|
||||||
|
from user space through MMIO regions or I/O registers exposed directly to
|
||||||
|
it and to prevent user space from entering the kernel while the next step
|
||||||
|
of the transition is in progress (which might be problematic for
|
||||||
|
various reasons).
|
||||||
|
|
||||||
|
All user space tasks are intercepted as though they were sent a signal and
|
||||||
|
put into uninterruptible sleep until the end of the subsequent system resume
|
||||||
|
transition.
|
||||||
|
|
||||||
|
The kernel threads that choose to be frozen during system suspend for
|
||||||
|
specific reasons are frozen subsequently, but they are not intercepted.
|
||||||
|
Instead, they are expected to periodically check whether or not they need
|
||||||
|
to be frozen and to put themselves into uninterruptible sleep if so. [Note,
|
||||||
|
however, that kernel threads can use locking and other concurrency controls
|
||||||
|
available in kernel space to synchronize themselves with system suspend and
|
||||||
|
resume, which can be much more precise than the freezing, so the latter is
|
||||||
|
not a recommended option for kernel threads.]
|
||||||
|
|
||||||
|
3. Suspending devices and reconfiguring IRQs.
|
||||||
|
|
||||||
|
Devices are suspended in four phases called *prepare*, *suspend*,
|
||||||
|
*late suspend* and *noirq suspend* (see :ref:`driverapi_pm_devices` for more
|
||||||
|
information on what exactly happens in each phase).
|
||||||
|
|
||||||
|
Every device is visited in each phase, but typically it is not physically
|
||||||
|
accessed in more than two of them.
|
||||||
|
|
||||||
|
The runtime PM API is disabled for every device during the *late* suspend
|
||||||
|
phase and high-level ("action") interrupt handlers are prevented from being
|
||||||
|
invoked before the *noirq* suspend phase.
|
||||||
|
|
||||||
|
Interrupts are still handled after that, but they are only acknowledged to
|
||||||
|
interrupt controllers without performing any device-specific actions that
|
||||||
|
would be triggered in the working state of the system (those actions are
|
||||||
|
deferred till the subsequent system resume transition as described
|
||||||
|
`below <s2idle_resume_>`_).
|
||||||
|
|
||||||
|
IRQs associated with system wakeup devices are "armed" so that the resume
|
||||||
|
transition of the system is started when one of them signals an event.
|
||||||
|
|
||||||
|
4. Freezing the scheduler tick and suspending timekeeping.
|
||||||
|
|
||||||
|
When all devices have been suspended, CPUs enter the idle loop and are put
|
||||||
|
into the deepest available idle state. While doing that, each of them
|
||||||
|
"freezes" its own scheduler tick so that the timer events associated with
|
||||||
|
the tick do not occur until the CPU is woken up by another interrupt source.
|
||||||
|
|
||||||
|
The last CPU to enter the idle state also stops the timekeeping which
|
||||||
|
(among other things) prevents high resolution timers from triggering going
|
||||||
|
forward until the first CPU that is woken up restarts the timekeeping.
|
||||||
|
That allows the CPUs to stay in the deep idle state relatively long in one
|
||||||
|
go.
|
||||||
|
|
||||||
|
From this point on, the CPUs can only be woken up by non-timer hardware
|
||||||
|
interrupts. If that happens, they go back to the idle state unless the
|
||||||
|
interrupt that woke up one of them comes from an IRQ that has been armed for
|
||||||
|
system wakeup, in which case the system resume transition is started.
|
||||||
|
|
||||||
|
|
||||||
|
.. _s2idle_resume:
|
||||||
|
|
||||||
|
Suspend-to-idle Resume Code Flow
|
||||||
|
================================
|
||||||
|
|
||||||
|
The following steps are taken in order to transition the system from the
|
||||||
|
:ref:`suspend-to-idle <s2idle>` sleep state into the working state:
|
||||||
|
|
||||||
|
1. Resuming timekeeping and unfreezing the scheduler tick.
|
||||||
|
|
||||||
|
When one of the CPUs is woken up (by a non-timer hardware interrupt), it
|
||||||
|
leaves the idle state entered in the last step of the preceding suspend
|
||||||
|
transition, restarts the timekeeping (unless it has been restarted already
|
||||||
|
by another CPU that woke up earlier) and the scheduler tick on that CPU is
|
||||||
|
unfrozen.
|
||||||
|
|
||||||
|
If the interrupt that has woken up the CPU was armed for system wakeup,
|
||||||
|
the system resume transition begins.
|
||||||
|
|
||||||
|
2. Resuming devices and restoring the working-state configuration of IRQs.
|
||||||
|
|
||||||
|
Devices are resumed in four phases called *noirq resume*, *early resume*,
|
||||||
|
*resume* and *complete* (see :ref:`driverapi_pm_devices` for more
|
||||||
|
information on what exactly happens in each phase).
|
||||||
|
|
||||||
|
Every device is visited in each phase, but typically it is not physically
|
||||||
|
accessed in more than two of them.
|
||||||
|
|
||||||
|
The working-state configuration of IRQs is restored after the *noirq* resume
|
||||||
|
phase and the runtime PM API is re-enabled for every device whose driver
|
||||||
|
supports it during the *early* resume phase.
|
||||||
|
|
||||||
|
3. Thawing tasks.
|
||||||
|
|
||||||
|
Tasks frozen in step 2 of the preceding `suspend <s2idle_suspend_>`_
|
||||||
|
transition are "thawed", which means that they are woken up from the
|
||||||
|
uninterruptible sleep that they went into at that time and user space tasks
|
||||||
|
are allowed to exit the kernel.
|
||||||
|
|
||||||
|
4. Invoking system-wide resume notifiers.
|
||||||
|
|
||||||
|
This is analogous to step 1 of the `suspend <s2idle_suspend_>`_ transition
|
||||||
|
and the same set of callbacks is invoked at this point, but a different
|
||||||
|
"notification type" parameter value is passed to them.
|
||||||
|
|
||||||
|
|
||||||
|
Platform-dependent Suspend Code Flow
|
||||||
|
====================================
|
||||||
|
|
||||||
|
The following steps are taken in order to transition the system from the working
|
||||||
|
state to a platform-dependent suspend state:
|
||||||
|
|
||||||
|
1. Invoking system-wide suspend notifiers.
|
||||||
|
|
||||||
|
This step is the same as step 1 of the suspend-to-idle suspend transition
|
||||||
|
described `above <s2idle_suspend_>`_.
|
||||||
|
|
||||||
|
2. Freezing tasks.
|
||||||
|
|
||||||
|
This step is the same as step 2 of the suspend-to-idle suspend transition
|
||||||
|
described `above <s2idle_suspend_>`_.
|
||||||
|
|
||||||
|
3. Suspending devices and reconfiguring IRQs.
|
||||||
|
|
||||||
|
This step is analogous to step 3 of the suspend-to-idle suspend transition
|
||||||
|
described `above <s2idle_suspend_>`_, but the arming of IRQs for system
|
||||||
|
wakeup generally does not have any effect on the platform.
|
||||||
|
|
||||||
|
There are platforms that can go into a very deep low-power state internally
|
||||||
|
when all CPUs in them are in sufficiently deep idle states and all I/O
|
||||||
|
devices have been put into low-power states. On those platforms,
|
||||||
|
suspend-to-idle can reduce system power very effectively.
|
||||||
|
|
||||||
|
On the other platforms, however, low-level components (like interrupt
|
||||||
|
controllers) need to be turned off in a platform-specific way (implemented
|
||||||
|
in the hooks provided by the platform driver) to achieve comparable power
|
||||||
|
reduction.
|
||||||
|
|
||||||
|
That usually prevents in-band hardware interrupts from waking up the system,
|
||||||
|
which must be done in a special platform-dependent way. Then, the
|
||||||
|
configuration of system wakeup sources usually starts when system wakeup
|
||||||
|
devices are suspended and is finalized by the platform suspend hooks later
|
||||||
|
on.
|
||||||
|
|
||||||
|
4. Disabling non-boot CPUs.
|
||||||
|
|
||||||
|
On some platforms the suspend hooks mentioned above must run in a one-CPU
|
||||||
|
configuration of the system (in particular, the hardware cannot be accessed
|
||||||
|
by any code running in parallel with the platform suspend hooks that may,
|
||||||
|
and often do, trap into the platform firmware in order to finalize the
|
||||||
|
suspend transition).
|
||||||
|
|
||||||
|
For this reason, the CPU offline/online (CPU hotplug) framework is used
|
||||||
|
to take all of the CPUs in the system, except for one (the boot CPU),
|
||||||
|
offline (typically, the CPUs that have been taken offline go into deep idle
|
||||||
|
states).
|
||||||
|
|
||||||
|
This means that all tasks are migrated away from those CPUs and all IRQs are
|
||||||
|
rerouted to the only CPU that remains online.
|
||||||
|
|
||||||
|
5. Suspending core system components.
|
||||||
|
|
||||||
|
This prepares the core system components for (possibly) losing power going
|
||||||
|
forward and suspends the timekeeping.
|
||||||
|
|
||||||
|
6. Platform-specific power removal.
|
||||||
|
|
||||||
|
This is expected to remove power from all of the system components except
|
||||||
|
for the memory controller and RAM (in order to preserve the contents of the
|
||||||
|
latter) and some devices designated for system wakeup.
|
||||||
|
|
||||||
|
In many cases control is passed to the platform firmware which is expected
|
||||||
|
to finalize the suspend transition as needed.
|
||||||
|
|
||||||
|
|
||||||
|
Platform-dependent Resume Code Flow
|
||||||
|
===================================
|
||||||
|
|
||||||
|
The following steps are taken in order to transition the system from a
|
||||||
|
platform-dependent suspend state into the working state:
|
||||||
|
|
||||||
|
1. Platform-specific system wakeup.
|
||||||
|
|
||||||
|
The platform is woken up by a signal from one of the designated system
|
||||||
|
wakeup devices (which need not be an in-band hardware interrupt) and
|
||||||
|
control is passed back to the kernel (the working configuration of the
|
||||||
|
platform may need to be restored by the platform firmware before the
|
||||||
|
kernel gets control again).
|
||||||
|
|
||||||
|
2. Resuming core system components.
|
||||||
|
|
||||||
|
The suspend-time configuration of the core system components is restored and
|
||||||
|
the timekeeping is resumed.
|
||||||
|
|
||||||
|
3. Re-enabling non-boot CPUs.
|
||||||
|
|
||||||
|
The CPUs disabled in step 4 of the preceding suspend transition are taken
|
||||||
|
back online and their suspend-time configuration is restored.
|
||||||
|
|
||||||
|
4. Resuming devices and restoring the working-state configuration of IRQs.
|
||||||
|
|
||||||
|
This step is the same as step 2 of the suspend-to-idle resume transition
|
||||||
|
described `above <s2idle_resume_>`_.
|
||||||
|
|
||||||
|
5. Thawing tasks.
|
||||||
|
|
||||||
|
This step is the same as step 3 of the suspend-to-idle resume transition
|
||||||
|
described `above <s2idle_resume_>`_.
|
||||||
|
|
||||||
|
6. Invoking system-wide resume notifiers.
|
||||||
|
|
||||||
|
This step is the same as step 4 of the suspend-to-idle resume transition
|
||||||
|
described `above <s2idle_resume_>`_.
|
|
@ -8,3 +8,4 @@ System-Wide Power Management
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
sleep-states
|
sleep-states
|
||||||
|
suspend-flows
|
||||||
|
|
|
@ -11,4 +11,5 @@ Working-State Power Management
|
||||||
intel_idle
|
intel_idle
|
||||||
cpufreq
|
cpufreq
|
||||||
intel_pstate
|
intel_pstate
|
||||||
|
cpufreq_drivers
|
||||||
intel_epb
|
intel_epb
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -67,7 +67,8 @@ two flavors of JITs, the newer eBPF JIT currently supported on:
|
||||||
- sparc64
|
- sparc64
|
||||||
- mips64
|
- mips64
|
||||||
- s390x
|
- s390x
|
||||||
- riscv
|
- riscv64
|
||||||
|
- riscv32
|
||||||
|
|
||||||
And the older cBPF JIT supported on the following archs:
|
And the older cBPF JIT supported on the following archs:
|
||||||
|
|
||||||
|
|
|
@ -65,6 +65,12 @@ max_pid_namespaces
|
||||||
The maximum number of pid namespaces that any user in the current
|
The maximum number of pid namespaces that any user in the current
|
||||||
user namespace may create.
|
user namespace may create.
|
||||||
|
|
||||||
|
max_time_namespaces
|
||||||
|
===================
|
||||||
|
|
||||||
|
The maximum number of time namespaces that any user in the current
|
||||||
|
user namespace may create.
|
||||||
|
|
||||||
max_user_namespaces
|
max_user_namespaces
|
||||||
===================
|
===================
|
||||||
|
|
||||||
|
|
|
@ -128,6 +128,9 @@ allowed to examine the unevictable lru (mlocked pages) for pages to compact.
|
||||||
This should be used on systems where stalls for minor page faults are an
|
This should be used on systems where stalls for minor page faults are an
|
||||||
acceptable trade for large contiguous free memory. Set to 0 to prevent
|
acceptable trade for large contiguous free memory. Set to 0 to prevent
|
||||||
compaction from moving pages that are unevictable. Default value is 1.
|
compaction from moving pages that are unevictable. Default value is 1.
|
||||||
|
On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
|
||||||
|
to compaction, which would block the task from becoming active until the fault
|
||||||
|
is resolved.
|
||||||
|
|
||||||
|
|
||||||
dirty_background_bytes
|
dirty_background_bytes
|
||||||
|
|
|
@ -48,9 +48,10 @@ always allowed (by a user with admin privileges).
|
||||||
How do I use the magic SysRq key?
|
How do I use the magic SysRq key?
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
On x86
|
||||||
|
You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
Some
|
Some
|
||||||
keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is
|
keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is
|
||||||
also known as the 'Print Screen' key. Also some keyboards cannot
|
also known as the 'Print Screen' key. Also some keyboards cannot
|
||||||
|
@ -58,14 +59,15 @@ On x86 - You press the key combo :kbd:`ALT-SysRq-<command key>`.
|
||||||
have better luck with press :kbd:`Alt`, press :kbd:`SysRq`,
|
have better luck with press :kbd:`Alt`, press :kbd:`SysRq`,
|
||||||
release :kbd:`SysRq`, press :kbd:`<command key>`, release everything.
|
release :kbd:`SysRq`, press :kbd:`<command key>`, release everything.
|
||||||
|
|
||||||
On SPARC - You press :kbd:`ALT-STOP-<command key>`, I believe.
|
On SPARC
|
||||||
|
You press :kbd:`ALT-STOP-<command key>`, I believe.
|
||||||
|
|
||||||
On the serial console (PC style standard serial ports only)
|
On the serial console (PC style standard serial ports only)
|
||||||
You send a ``BREAK``, then within 5 seconds a command key. Sending
|
You send a ``BREAK``, then within 5 seconds a command key. Sending
|
||||||
``BREAK`` twice is interpreted as a normal BREAK.
|
``BREAK`` twice is interpreted as a normal BREAK.
|
||||||
|
|
||||||
On PowerPC
|
On PowerPC
|
||||||
Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`,
|
Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:`<command key>`.
|
||||||
:kbd:`Print Screen` (or :kbd:`F13`) - :kbd:`<command key>` may suffice.
|
:kbd:`Print Screen` (or :kbd:`F13`) - :kbd:`<command key>` may suffice.
|
||||||
|
|
||||||
On other
|
On other
|
||||||
|
@ -73,7 +75,7 @@ On other
|
||||||
let me know so I can add them to this section.
|
let me know so I can add them to this section.
|
||||||
|
|
||||||
On all
|
On all
|
||||||
write a character to /proc/sysrq-trigger. e.g.::
|
Write a character to /proc/sysrq-trigger. e.g.::
|
||||||
|
|
||||||
echo t > /proc/sysrq-trigger
|
echo t > /proc/sysrq-trigger
|
||||||
|
|
||||||
|
@ -282,7 +284,7 @@ Just ask them on the linux-kernel mailing list:
|
||||||
Credits
|
Credits
|
||||||
~~~~~~~
|
~~~~~~~
|
||||||
|
|
||||||
Written by Mydraal <vulpyne@vulpyne.net>
|
- Written by Mydraal <vulpyne@vulpyne.net>
|
||||||
Updated by Adam Sulmicki <adam@cfar.umd.edu>
|
- Updated by Adam Sulmicki <adam@cfar.umd.edu>
|
||||||
Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
|
- Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
|
||||||
Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
|
- Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
|
||||||
|
|
|
@ -4,18 +4,18 @@ ARM TCM (Tightly-Coupled Memory) handling in Linux
|
||||||
|
|
||||||
Written by Linus Walleij <linus.walleij@stericsson.com>
|
Written by Linus Walleij <linus.walleij@stericsson.com>
|
||||||
|
|
||||||
Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
|
Some ARM SoCs have a so-called TCM (Tightly-Coupled Memory).
|
||||||
This is usually just a few (4-64) KiB of RAM inside the ARM
|
This is usually just a few (4-64) KiB of RAM inside the ARM
|
||||||
processor.
|
processor.
|
||||||
|
|
||||||
Due to being embedded inside the CPU The TCM has a
|
Due to being embedded inside the CPU, the TCM has a
|
||||||
Harvard-architecture, so there is an ITCM (instruction TCM)
|
Harvard-architecture, so there is an ITCM (instruction TCM)
|
||||||
and a DTCM (data TCM). The DTCM can not contain any
|
and a DTCM (data TCM). The DTCM can not contain any
|
||||||
instructions, but the ITCM can actually contain data.
|
instructions, but the ITCM can actually contain data.
|
||||||
The size of DTCM or ITCM is minimum 4KiB so the typical
|
The size of DTCM or ITCM is minimum 4KiB so the typical
|
||||||
minimum configuration is 4KiB ITCM and 4KiB DTCM.
|
minimum configuration is 4KiB ITCM and 4KiB DTCM.
|
||||||
|
|
||||||
ARM CPU:s have special registers to read out status, physical
|
ARM CPUs have special registers to read out status, physical
|
||||||
location and size of TCM memories. arch/arm/include/asm/cputype.h
|
location and size of TCM memories. arch/arm/include/asm/cputype.h
|
||||||
defines a CPUID_TCM register that you can read out from the
|
defines a CPUID_TCM register that you can read out from the
|
||||||
system control coprocessor. Documentation from ARM can be found
|
system control coprocessor. Documentation from ARM can be found
|
||||||
|
|
112
Documentation/arm64/amu.rst
Normal file
112
Documentation/arm64/amu.rst
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
=======================================================
|
||||||
|
Activity Monitors Unit (AMU) extension in AArch64 Linux
|
||||||
|
=======================================================
|
||||||
|
|
||||||
|
Author: Ionela Voinescu <ionela.voinescu@arm.com>
|
||||||
|
|
||||||
|
Date: 2019-09-10
|
||||||
|
|
||||||
|
This document briefly describes the provision of Activity Monitors Unit
|
||||||
|
support in AArch64 Linux.
|
||||||
|
|
||||||
|
|
||||||
|
Architecture overview
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
The activity monitors extension is an optional extension introduced by the
|
||||||
|
ARMv8.4 CPU architecture.
|
||||||
|
|
||||||
|
The activity monitors unit, implemented in each CPU, provides performance
|
||||||
|
counters intended for system management use. The AMU extension provides a
|
||||||
|
system register interface to the counter registers and also supports an
|
||||||
|
optional external memory-mapped interface.
|
||||||
|
|
||||||
|
Version 1 of the Activity Monitors architecture implements a counter group
|
||||||
|
of four fixed and architecturally defined 64-bit event counters.
|
||||||
|
- CPU cycle counter: increments at the frequency of the CPU.
|
||||||
|
- Constant counter: increments at the fixed frequency of the system
|
||||||
|
clock.
|
||||||
|
- Instructions retired: increments with every architecturally executed
|
||||||
|
instruction.
|
||||||
|
- Memory stall cycles: counts instruction dispatch stall cycles caused by
|
||||||
|
misses in the last level cache within the clock domain.
|
||||||
|
|
||||||
|
When in WFI or WFE these counters do not increment.
|
||||||
|
|
||||||
|
The Activity Monitors architecture provides space for up to 16 architected
|
||||||
|
event counters. Future versions of the architecture may use this space to
|
||||||
|
implement additional architected event counters.
|
||||||
|
|
||||||
|
Additionally, version 1 implements a counter group of up to 16 auxiliary
|
||||||
|
64-bit event counters.
|
||||||
|
|
||||||
|
On cold reset all counters reset to 0.
|
||||||
|
|
||||||
|
|
||||||
|
Basic support
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The kernel can safely run a mix of CPUs with and without support for the
|
||||||
|
activity monitors extension. Therefore, when CONFIG_ARM64_AMU_EXTN is
|
||||||
|
selected we unconditionally enable the capability to allow any late CPU
|
||||||
|
(secondary or hotplugged) to detect and use the feature.
|
||||||
|
|
||||||
|
When the feature is detected on a CPU, we flag the availability of the
|
||||||
|
feature but this does not guarantee the correct functionality of the
|
||||||
|
counters, only the presence of the extension.
|
||||||
|
|
||||||
|
Firmware (code running at higher exception levels, e.g. arm-tf) support is
|
||||||
|
needed to:
|
||||||
|
- Enable access for lower exception levels (EL2 and EL1) to the AMU
|
||||||
|
registers.
|
||||||
|
- Enable the counters. If not enabled these will read as 0.
|
||||||
|
- Save/restore the counters before/after the CPU is put into/brought up
|
||||||
|
from the 'off' power state.
|
||||||
|
|
||||||
|
When using kernels that have this feature enabled but boot with broken
|
||||||
|
firmware the user may experience panics or lockups when accessing the
|
||||||
|
counter registers. Even if these symptoms are not observed, the values
|
||||||
|
returned by the register reads might not correctly reflect reality. Most
|
||||||
|
commonly, the counters will read as 0, indicating that they are not
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
If proper support is not provided in firmware it's best to disable
|
||||||
|
CONFIG_ARM64_AMU_EXTN. Note that, for security reasons, this does not
|
||||||
|
bypass the setting of AMUSERENR_EL0 to trap accesses from EL0 (userspace) to
|
||||||
|
EL1 (kernel). Therefore, firmware should still ensure accesses to AMU registers
|
||||||
|
are not trapped in EL2/EL3.
|
||||||
|
|
||||||
|
The fixed counters of AMUv1 are accessible through the following system
|
||||||
|
register definitions:
|
||||||
|
- SYS_AMEVCNTR0_CORE_EL0
|
||||||
|
- SYS_AMEVCNTR0_CONST_EL0
|
||||||
|
- SYS_AMEVCNTR0_INST_RET_EL0
|
||||||
|
- SYS_AMEVCNTR0_MEM_STALL_EL0
|
||||||
|
|
||||||
|
Auxiliary platform specific counters can be accessed using
|
||||||
|
SYS_AMEVCNTR1_EL0(n), where n is a value between 0 and 15.
|
||||||
|
|
||||||
|
Details can be found in: arch/arm64/include/asm/sysreg.h.
|
||||||
|
|
||||||
|
|
||||||
|
Userspace access
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Currently, access from userspace to the AMU registers is disabled due to:
|
||||||
|
- Security reasons: they might expose information about code executed in
|
||||||
|
secure mode.
|
||||||
|
- Purpose: AMU counters are intended for system management use.
|
||||||
|
|
||||||
|
Also, the presence of the feature is not visible to userspace.
|
||||||
|
|
||||||
|
|
||||||
|
Virtualization
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Currently, access from userspace (EL0) and kernelspace (EL1) on the KVM
|
||||||
|
guest side is disabled due to:
|
||||||
|
- Security reasons: they might expose information about code executed
|
||||||
|
by other guests or the host.
|
||||||
|
|
||||||
|
Any attempt to access the AMU registers will result in an UNDEFINED
|
||||||
|
exception being injected into the guest.
|
|
@ -248,6 +248,20 @@ Before jumping into the kernel, the following conditions must be met:
|
||||||
- HCR_EL2.APK (bit 40) must be initialised to 0b1
|
- HCR_EL2.APK (bit 40) must be initialised to 0b1
|
||||||
- HCR_EL2.API (bit 41) must be initialised to 0b1
|
- HCR_EL2.API (bit 41) must be initialised to 0b1
|
||||||
|
|
||||||
|
For CPUs with Activity Monitors Unit v1 (AMUv1) extension present:
|
||||||
|
- If EL3 is present:
|
||||||
|
CPTR_EL3.TAM (bit 30) must be initialised to 0b0
|
||||||
|
CPTR_EL2.TAM (bit 30) must be initialised to 0b0
|
||||||
|
AMCNTENSET0_EL0 must be initialised to 0b1111
|
||||||
|
AMCNTENSET1_EL0 must be initialised to a platform specific value
|
||||||
|
having 0b1 set for the corresponding bit for each of the auxiliary
|
||||||
|
counters present.
|
||||||
|
- If the kernel is entered at EL1:
|
||||||
|
AMCNTENSET0_EL0 must be initialised to 0b1111
|
||||||
|
AMCNTENSET1_EL0 must be initialised to a platform specific value
|
||||||
|
having 0b1 set for the corresponding bit for each of the auxiliary
|
||||||
|
counters present.
|
||||||
|
|
||||||
The requirements described above for CPU mode, caches, MMUs, architected
|
The requirements described above for CPU mode, caches, MMUs, architected
|
||||||
timers, coherency and system registers apply to all CPUs. All CPUs must
|
timers, coherency and system registers apply to all CPUs. All CPUs must
|
||||||
enter the kernel in the same exception level.
|
enter the kernel in the same exception level.
|
||||||
|
|
|
@ -6,6 +6,7 @@ ARM64 Architecture
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
acpi_object_usage
|
acpi_object_usage
|
||||||
|
amu
|
||||||
arm-acpi
|
arm-acpi
|
||||||
booting
|
booting
|
||||||
cpu-feature-registers
|
cpu-feature-registers
|
||||||
|
|
|
@ -129,7 +129,7 @@ this logic.
|
||||||
|
|
||||||
As a single binary will need to support both 48-bit and 52-bit VA
|
As a single binary will need to support both 48-bit and 52-bit VA
|
||||||
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
|
spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
|
||||||
also must be sized large enought to accommodate a fixed PAGE_OFFSET.
|
also must be sized large enough to accommodate a fixed PAGE_OFFSET.
|
||||||
|
|
||||||
Most code in the kernel should not need to consider the VA_BITS, for
|
Most code in the kernel should not need to consider the VA_BITS, for
|
||||||
code that does need to know the VA size the variables are
|
code that does need to know the VA size the variables are
|
||||||
|
|
|
@ -110,6 +110,8 @@ stable kernels.
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
|
| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| Cavium | ThunderX GICv3 | #38539 | N/A |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
|
| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
|
| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
|
||||||
|
|
|
@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending
|
||||||
how the user addresses are used by the kernel:
|
how the user addresses are used by the kernel:
|
||||||
|
|
||||||
1. User addresses not accessed by the kernel but used for address space
|
1. User addresses not accessed by the kernel but used for address space
|
||||||
management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use
|
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
|
||||||
of valid tagged pointers in this context is always allowed.
|
tagged pointers in this context is allowed with the exception of
|
||||||
|
``brk()``, ``mmap()`` and the ``new_address`` argument to
|
||||||
|
``mremap()`` as these have the potential to alias with existing
|
||||||
|
user addresses.
|
||||||
|
|
||||||
|
NOTE: This behaviour changed in v5.6 and so some earlier kernels may
|
||||||
|
incorrectly accept valid tagged pointers for the ``brk()``,
|
||||||
|
``mmap()`` and ``mremap()`` system calls.
|
||||||
|
|
||||||
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
|
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
|
||||||
relaxation is disabled by default and the application thread needs to
|
relaxation is disabled by default and the application thread needs to
|
||||||
|
|
|
@ -2,17 +2,9 @@
|
||||||
Generic Block Device Capability
|
Generic Block Device Capability
|
||||||
===============================
|
===============================
|
||||||
|
|
||||||
This file documents the sysfs file block/<disk>/capability
|
This file documents the sysfs file ``block/<disk>/capability``.
|
||||||
|
|
||||||
capability is a hex word indicating which capabilities a specific disk
|
``capability`` is a bitfield, printed in hexadecimal, indicating which
|
||||||
supports. For more information on bits not listed here, see
|
capabilities a specific block device supports:
|
||||||
include/linux/genhd.h
|
|
||||||
|
|
||||||
GENHD_FL_MEDIA_CHANGE_NOTIFY
|
.. kernel-doc:: include/linux/genhd.h
|
||||||
----------------------------
|
|
||||||
|
|
||||||
Value: 4
|
|
||||||
|
|
||||||
When this bit is set, the disk supports Asynchronous Notification
|
|
||||||
of media change events. These events will be broadcast to user
|
|
||||||
space via kernel uevent.
|
|
||||||
|
|
|
@ -20,11 +20,11 @@ Reporting bugs
|
||||||
Q: How do I report bugs for BPF kernel code?
|
Q: How do I report bugs for BPF kernel code?
|
||||||
--------------------------------------------
|
--------------------------------------------
|
||||||
A: Since all BPF kernel development as well as bpftool and iproute2 BPF
|
A: Since all BPF kernel development as well as bpftool and iproute2 BPF
|
||||||
loader development happens through the netdev kernel mailing list,
|
loader development happens through the bpf kernel mailing list,
|
||||||
please report any found issues around BPF to the following mailing
|
please report any found issues around BPF to the following mailing
|
||||||
list:
|
list:
|
||||||
|
|
||||||
netdev@vger.kernel.org
|
bpf@vger.kernel.org
|
||||||
|
|
||||||
This may also include issues related to XDP, BPF tracing, etc.
|
This may also include issues related to XDP, BPF tracing, etc.
|
||||||
|
|
||||||
|
@ -46,17 +46,12 @@ Submitting patches
|
||||||
|
|
||||||
Q: To which mailing list do I need to submit my BPF patches?
|
Q: To which mailing list do I need to submit my BPF patches?
|
||||||
------------------------------------------------------------
|
------------------------------------------------------------
|
||||||
A: Please submit your BPF patches to the netdev kernel mailing list:
|
A: Please submit your BPF patches to the bpf kernel mailing list:
|
||||||
|
|
||||||
netdev@vger.kernel.org
|
bpf@vger.kernel.org
|
||||||
|
|
||||||
Historically, BPF came out of networking and has always been maintained
|
|
||||||
by the kernel networking community. Although these days BPF touches
|
|
||||||
many other subsystems as well, the patches are still routed mainly
|
|
||||||
through the networking community.
|
|
||||||
|
|
||||||
In case your patch has changes in various different subsystems (e.g.
|
In case your patch has changes in various different subsystems (e.g.
|
||||||
tracing, security, etc), make sure to Cc the related kernel mailing
|
networking, tracing, security, etc), make sure to Cc the related kernel mailing
|
||||||
lists and maintainers from there as well, so they are able to review
|
lists and maintainers from there as well, so they are able to review
|
||||||
the changes and provide their Acked-by's to the patches.
|
the changes and provide their Acked-by's to the patches.
|
||||||
|
|
||||||
|
@ -168,7 +163,7 @@ a BPF point of view.
|
||||||
Be aware that this is not a final verdict that the patch will
|
Be aware that this is not a final verdict that the patch will
|
||||||
automatically get accepted into net or net-next trees eventually:
|
automatically get accepted into net or net-next trees eventually:
|
||||||
|
|
||||||
On the netdev kernel mailing list reviews can come in at any point
|
On the bpf kernel mailing list reviews can come in at any point
|
||||||
in time. If discussions around a patch conclude that they cannot
|
in time. If discussions around a patch conclude that they cannot
|
||||||
get included as-is, we will either apply a follow-up fix or drop
|
get included as-is, we will either apply a follow-up fix or drop
|
||||||
them from the trees entirely. Therefore, we also reserve to rebase
|
them from the trees entirely. Therefore, we also reserve to rebase
|
||||||
|
@ -494,15 +489,15 @@ A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
|
||||||
that set up, proceed with building the latest LLVM and clang version
|
that set up, proceed with building the latest LLVM and clang version
|
||||||
from the git repositories::
|
from the git repositories::
|
||||||
|
|
||||||
$ git clone http://llvm.org/git/llvm.git
|
$ git clone https://github.com/llvm/llvm-project.git
|
||||||
$ cd llvm/tools
|
$ mkdir -p llvm-project/llvm/build/install
|
||||||
$ git clone --depth 1 http://llvm.org/git/clang.git
|
$ cd llvm-project/llvm/build
|
||||||
$ cd ..; mkdir build; cd build
|
$ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
|
||||||
$ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
|
-DLLVM_ENABLE_PROJECTS="clang" \
|
||||||
-DBUILD_SHARED_LIBS=OFF \
|
-DBUILD_SHARED_LIBS=OFF \
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
-DLLVM_BUILD_RUNTIME=OFF
|
-DLLVM_BUILD_RUNTIME=OFF
|
||||||
$ make -j $(getconf _NPROCESSORS_ONLN)
|
$ ninja
|
||||||
|
|
||||||
The built binaries can then be found in the build/bin/ directory, where
|
The built binaries can then be found in the build/bin/ directory, where
|
||||||
you can point the PATH variable to.
|
you can point the PATH variable to.
|
||||||
|
|
142
Documentation/bpf/bpf_lsm.rst
Normal file
142
Documentation/bpf/bpf_lsm.rst
Normal file
|
@ -0,0 +1,142 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0+
|
||||||
|
.. Copyright (C) 2020 Google LLC.
|
||||||
|
|
||||||
|
================
|
||||||
|
LSM BPF Programs
|
||||||
|
================
|
||||||
|
|
||||||
|
These BPF programs allow runtime instrumentation of the LSM hooks by privileged
|
||||||
|
users to implement system-wide MAC (Mandatory Access Control) and Audit
|
||||||
|
policies using eBPF.
|
||||||
|
|
||||||
|
Structure
|
||||||
|
---------
|
||||||
|
|
||||||
|
The example shows an eBPF program that can be attached to the ``file_mprotect``
|
||||||
|
LSM hook:
|
||||||
|
|
||||||
|
.. c:function:: int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);
|
||||||
|
|
||||||
|
Other LSM hooks which can be instrumented can be found in
|
||||||
|
``include/linux/lsm_hooks.h``.
|
||||||
|
|
||||||
|
eBPF programs that use :doc:`/bpf/btf` do not need to include kernel headers
|
||||||
|
for accessing information from the attached eBPF program's context. They can
|
||||||
|
simply declare the structures in the eBPF program and only specify the fields
|
||||||
|
that need to be accessed.
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct mm_struct {
|
||||||
|
unsigned long start_brk, brk, start_stack;
|
||||||
|
} __attribute__((preserve_access_index));
|
||||||
|
|
||||||
|
struct vm_area_struct {
|
||||||
|
unsigned long start_brk, brk, start_stack;
|
||||||
|
unsigned long vm_start, vm_end;
|
||||||
|
struct mm_struct *vm_mm;
|
||||||
|
} __attribute__((preserve_access_index));
|
||||||
|
|
||||||
|
|
||||||
|
.. note:: The order of the fields is irrelevant.
|
||||||
|
|
||||||
|
This can be further simplified (if one has access to the BTF information at
|
||||||
|
build time) by generating the ``vmlinux.h`` with:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
# bpftool btf dump file <path-to-btf-vmlinux> format c > vmlinux.h
|
||||||
|
|
||||||
|
.. note:: ``path-to-btf-vmlinux`` can be ``/sys/kernel/btf/vmlinux`` if the
|
||||||
|
build environment matches the environment the BPF programs are
|
||||||
|
deployed in.
|
||||||
|
|
||||||
|
The ``vmlinux.h`` can then simply be included in the BPF programs without
|
||||||
|
requiring the definition of the types.
|
||||||
|
|
||||||
|
The eBPF programs can be declared using the ``BPF_PROG``
|
||||||
|
macros defined in `tools/lib/bpf/bpf_tracing.h`_. In this
|
||||||
|
example:
|
||||||
|
|
||||||
|
* ``"lsm/file_mprotect"`` indicates the LSM hook that the program must
|
||||||
|
be attached to
|
||||||
|
* ``mprotect_audit`` is the name of the eBPF program
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
SEC("lsm/file_mprotect")
|
||||||
|
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
|
||||||
|
unsigned long reqprot, unsigned long prot, int ret)
|
||||||
|
{
|
||||||
|
/* ret is the return value from the previous BPF program
|
||||||
|
* or 0 if it's the first hook.
|
||||||
|
*/
|
||||||
|
if (ret != 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
int is_heap;
|
||||||
|
|
||||||
|
is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
|
||||||
|
vma->vm_end <= vma->vm_mm->brk);
|
||||||
|
|
||||||
|
/* Return an -EPERM or write information to the perf events buffer
|
||||||
|
* for auditing
|
||||||
|
*/
|
||||||
|
if (is_heap)
|
||||||
|
return -EPERM;
|
||||||
|
}
|
||||||
|
|
||||||
|
The ``__attribute__((preserve_access_index))`` is a clang feature that allows
|
||||||
|
the BPF verifier to update the offsets for the access at runtime using the
|
||||||
|
:doc:`/bpf/btf` information. Since the BPF verifier is aware of the types, it
|
||||||
|
also validates all the accesses made to the various types in the eBPF program.
|
||||||
|
|
||||||
|
Loading
|
||||||
|
-------
|
||||||
|
|
||||||
|
eBPF programs can be loaded with the :manpage:`bpf(2)` syscall's
|
||||||
|
``BPF_PROG_LOAD`` operation:
|
||||||
|
|
||||||
|
.. code-block:: c
|
||||||
|
|
||||||
|
struct bpf_object *obj;
|
||||||
|
|
||||||
|
obj = bpf_object__open("./my_prog.o");
|
||||||
|
bpf_object__load(obj);
|
||||||
|
|
||||||
|
This can be simplified by using a skeleton header generated by ``bpftool``:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
# bpftool gen skeleton my_prog.o > my_prog.skel.h
|
||||||
|
|
||||||
|
and the program can be loaded by including ``my_prog.skel.h`` and using
|
||||||
|
the generated helper, ``my_prog__open_and_load``.
|
||||||
|
|
||||||
|
Attachment to LSM Hooks
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The LSM allows attachment of eBPF programs as LSM hooks using :manpage:`bpf(2)`
|
||||||
|
syscall's ``BPF_RAW_TRACEPOINT_OPEN`` operation or more simply by
|
||||||
|
using the libbpf helper ``bpf_program__attach_lsm``.
|
||||||
|
|
||||||
|
The program can be detached from the LSM hook by *destroying* the ``link``
|
||||||
|
returned by ``bpf_program__attach_lsm`` using ``bpf_link__destroy``.
|
||||||
|
|
||||||
|
One can also use the helpers generated in ``my_prog.skel.h`` i.e.
|
||||||
|
``my_prog__attach`` for attachment and ``my_prog__destroy`` for cleaning up.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
An example eBPF program can be found in
|
||||||
|
`tools/testing/selftests/bpf/progs/lsm.c`_ and the corresponding
|
||||||
|
userspace code in `tools/testing/selftests/bpf/prog_tests/test_lsm.c`_.
|
||||||
|
|
||||||
|
.. Links
|
||||||
|
.. _tools/lib/bpf/bpf_tracing.h:
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h
|
||||||
|
.. _tools/testing/selftests/bpf/progs/lsm.c:
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/lsm.c
|
||||||
|
.. _tools/testing/selftests/bpf/prog_tests/test_lsm.c:
|
||||||
|
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/test_lsm.c
|
213
Documentation/bpf/drgn.rst
Normal file
213
Documentation/bpf/drgn.rst
Normal file
|
@ -0,0 +1,213 @@
|
||||||
|
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||||
|
|
||||||
|
==============
|
||||||
|
BPF drgn tools
|
||||||
|
==============
|
||||||
|
|
||||||
|
drgn scripts are a convenient and easy-to-use mechanism to retrieve arbitrary
|
||||||
|
kernel data structures. drgn does not rely on kernel UAPI to read the data.
|
||||||
|
Instead, it reads directly from ``/proc/kcore`` or vmcore and pretty-prints
|
||||||
|
the data based on DWARF debug information from vmlinux.
|
||||||
|
|
||||||
|
This document describes BPF related drgn tools.
|
||||||
|
|
||||||
|
See `drgn/tools`_ for all tools available at the moment and `drgn/doc`_ for
|
||||||
|
more details on drgn itself.
|
||||||
|
|
||||||
|
bpf_inspect.py
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Description
|
||||||
|
===========
|
||||||
|
|
||||||
|
`bpf_inspect.py`_ is a tool intended to inspect BPF programs and maps. It can
|
||||||
|
iterate over all programs and maps in the system and print basic information
|
||||||
|
about these objects, including id, type and name.
|
||||||
|
|
||||||
|
The main use-case `bpf_inspect.py`_ covers is to show BPF programs of types
|
||||||
|
``BPF_PROG_TYPE_EXT`` and ``BPF_PROG_TYPE_TRACING`` attached to other BPF
|
||||||
|
programs via ``freplace``/``fentry``/``fexit`` mechanisms, since there is no
|
||||||
|
user-space API to get this information.
|
||||||
|
|
||||||
|
Getting started
|
||||||
|
===============
|
||||||
|
|
||||||
|
List BPF programs (full names are obtained from BTF)::
|
||||||
|
|
||||||
|
% sudo bpf_inspect.py prog
|
||||||
|
27: BPF_PROG_TYPE_TRACEPOINT tracepoint__tcp__tcp_send_reset
|
||||||
|
4632: BPF_PROG_TYPE_CGROUP_SOCK_ADDR tw_ipt_bind
|
||||||
|
49464: BPF_PROG_TYPE_RAW_TRACEPOINT raw_tracepoint__sched_process_exit
|
||||||
|
|
||||||
|
List BPF maps::
|
||||||
|
|
||||||
|
% sudo bpf_inspect.py map
|
||||||
|
2577: BPF_MAP_TYPE_HASH tw_ipt_vips
|
||||||
|
4050: BPF_MAP_TYPE_STACK_TRACE stack_traces
|
||||||
|
4069: BPF_MAP_TYPE_PERCPU_ARRAY ned_dctcp_cntr
|
||||||
|
|
||||||
|
Find BPF programs attached to BPF program ``test_pkt_access``::
|
||||||
|
|
||||||
|
% sudo bpf_inspect.py p | grep test_pkt_access
|
||||||
|
650: BPF_PROG_TYPE_SCHED_CLS test_pkt_access
|
||||||
|
654: BPF_PROG_TYPE_TRACING test_main linked:[650->25: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access()]
|
||||||
|
655: BPF_PROG_TYPE_TRACING test_subprog1 linked:[650->29: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog1()]
|
||||||
|
656: BPF_PROG_TYPE_TRACING test_subprog2 linked:[650->31: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog2()]
|
||||||
|
657: BPF_PROG_TYPE_TRACING test_subprog3 linked:[650->21: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog3()]
|
||||||
|
658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()]
|
||||||
|
659: BPF_PROG_TYPE_EXT new_get_skb_ifindex linked:[650->23: BPF_TRAMP_REPLACE test_pkt_access->get_skb_ifindex()]
|
||||||
|
660: BPF_PROG_TYPE_EXT new_get_constant linked:[650->19: BPF_TRAMP_REPLACE test_pkt_access->get_constant()]
|
||||||
|
|
||||||
|
It can be seen that there is a program ``test_pkt_access``, id 650 and there
|
||||||
|
are multiple other tracing and ext programs attached to functions in
|
||||||
|
``test_pkt_access``.
|
||||||
|
|
||||||
|
For example the line::
|
||||||
|
|
||||||
|
658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()]
|
||||||
|
|
||||||
|
, means that BPF program id 658, type ``BPF_PROG_TYPE_EXT``, name
|
||||||
|
``new_get_skb_len`` replaces (``BPF_TRAMP_REPLACE``) function ``get_skb_len()``
|
||||||
|
that has BTF id 16 in BPF program id 650, name ``test_pkt_access``.
|
||||||
|
|
||||||
|
Getting help:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
% sudo bpf_inspect.py
|
||||||
|
usage: bpf_inspect.py [-h] {prog,p,map,m} ...
|
||||||
|
|
||||||
|
drgn script to list BPF programs or maps and their properties
|
||||||
|
unavailable via kernel API.
|
||||||
|
|
||||||
|
See https://github.com/osandov/drgn/ for more details on drgn.
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
|
||||||
|
subcommands:
|
||||||
|
{prog,p,map,m}
|
||||||
|
prog (p) list BPF programs
|
||||||
|
map (m) list BPF maps
|
||||||
|
|
||||||
|
Customization
|
||||||
|
=============
|
||||||
|
|
||||||
|
The script is intended to be customized by developers to print relevant
|
||||||
|
information about BPF programs, maps and other objects.
|
||||||
|
|
||||||
|
For example, to print ``struct bpf_prog_aux`` for BPF program id 53077:
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
% git diff
|
||||||
|
diff --git a/tools/bpf_inspect.py b/tools/bpf_inspect.py
|
||||||
|
index 650e228..aea2357 100755
|
||||||
|
--- a/tools/bpf_inspect.py
|
||||||
|
+++ b/tools/bpf_inspect.py
|
||||||
|
@@ -112,7 +112,9 @@ def list_bpf_progs(args):
|
||||||
|
if linked:
|
||||||
|
linked = f" linked:[{linked}]"
|
||||||
|
|
||||||
|
- print(f"{id_:>6}: {type_:32} {name:32} {linked}")
|
||||||
|
+ if id_ == 53077:
|
||||||
|
+ print(f"{id_:>6}: {type_:32} {name:32}")
|
||||||
|
+ print(f"{bpf_prog.aux}")
|
||||||
|
|
||||||
|
|
||||||
|
def list_bpf_maps(args):
|
||||||
|
|
||||||
|
It produces the output::
|
||||||
|
|
||||||
|
% sudo bpf_inspect.py p
|
||||||
|
53077: BPF_PROG_TYPE_XDP tw_xdp_policer
|
||||||
|
*(struct bpf_prog_aux *)0xffff8893fad4b400 = {
|
||||||
|
.refcnt = (atomic64_t){
|
||||||
|
.counter = (long)58,
|
||||||
|
},
|
||||||
|
.used_map_cnt = (u32)1,
|
||||||
|
.max_ctx_offset = (u32)8,
|
||||||
|
.max_pkt_offset = (u32)15,
|
||||||
|
.max_tp_access = (u32)0,
|
||||||
|
.stack_depth = (u32)8,
|
||||||
|
.id = (u32)53077,
|
||||||
|
.func_cnt = (u32)0,
|
||||||
|
.func_idx = (u32)0,
|
||||||
|
.attach_btf_id = (u32)0,
|
||||||
|
.linked_prog = (struct bpf_prog *)0x0,
|
||||||
|
.verifier_zext = (bool)0,
|
||||||
|
.offload_requested = (bool)0,
|
||||||
|
.attach_btf_trace = (bool)0,
|
||||||
|
.func_proto_unreliable = (bool)0,
|
||||||
|
.trampoline_prog_type = (enum bpf_tramp_prog_type)BPF_TRAMP_FENTRY,
|
||||||
|
.trampoline = (struct bpf_trampoline *)0x0,
|
||||||
|
.tramp_hlist = (struct hlist_node){
|
||||||
|
.next = (struct hlist_node *)0x0,
|
||||||
|
.pprev = (struct hlist_node **)0x0,
|
||||||
|
},
|
||||||
|
.attach_func_proto = (const struct btf_type *)0x0,
|
||||||
|
.attach_func_name = (const char *)0x0,
|
||||||
|
.func = (struct bpf_prog **)0x0,
|
||||||
|
.jit_data = (void *)0x0,
|
||||||
|
.poke_tab = (struct bpf_jit_poke_descriptor *)0x0,
|
||||||
|
.size_poke_tab = (u32)0,
|
||||||
|
.ksym_tnode = (struct latch_tree_node){
|
||||||
|
.node = (struct rb_node [2]){
|
||||||
|
{
|
||||||
|
.__rb_parent_color = (unsigned long)18446612956263126665,
|
||||||
|
.rb_right = (struct rb_node *)0x0,
|
||||||
|
.rb_left = (struct rb_node *)0xffff88a0be3d0088,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.__rb_parent_color = (unsigned long)18446612956263126689,
|
||||||
|
.rb_right = (struct rb_node *)0x0,
|
||||||
|
.rb_left = (struct rb_node *)0xffff88a0be3d00a0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
.ksym_lnode = (struct list_head){
|
||||||
|
.next = (struct list_head *)0xffff88bf481830b8,
|
||||||
|
.prev = (struct list_head *)0xffff888309f536b8,
|
||||||
|
},
|
||||||
|
.ops = (const struct bpf_prog_ops *)xdp_prog_ops+0x0 = 0xffffffff820fa350,
|
||||||
|
.used_maps = (struct bpf_map **)0xffff889ff795de98,
|
||||||
|
.prog = (struct bpf_prog *)0xffffc9000cf2d000,
|
||||||
|
.user = (struct user_struct *)root_user+0x0 = 0xffffffff82444820,
|
||||||
|
.load_time = (u64)2408348759285319,
|
||||||
|
.cgroup_storage = (struct bpf_map *[2]){},
|
||||||
|
.name = (char [16])"tw_xdp_policer",
|
||||||
|
.security = (void *)0xffff889ff795d548,
|
||||||
|
.offload = (struct bpf_prog_offload *)0x0,
|
||||||
|
.btf = (struct btf *)0xffff8890ce6d0580,
|
||||||
|
.func_info = (struct bpf_func_info *)0xffff889ff795d240,
|
||||||
|
.func_info_aux = (struct bpf_func_info_aux *)0xffff889ff795de20,
|
||||||
|
.linfo = (struct bpf_line_info *)0xffff888a707afc00,
|
||||||
|
.jited_linfo = (void **)0xffff8893fad48600,
|
||||||
|
.func_info_cnt = (u32)1,
|
||||||
|
.nr_linfo = (u32)37,
|
||||||
|
.linfo_idx = (u32)0,
|
||||||
|
.num_exentries = (u32)0,
|
||||||
|
.extable = (struct exception_table_entry *)0xffffffffa032d950,
|
||||||
|
.stats = (struct bpf_prog_stats *)0x603fe3a1f6d0,
|
||||||
|
.work = (struct work_struct){
|
||||||
|
.data = (atomic_long_t){
|
||||||
|
.counter = (long)0,
|
||||||
|
},
|
||||||
|
.entry = (struct list_head){
|
||||||
|
.next = (struct list_head *)0x0,
|
||||||
|
.prev = (struct list_head *)0x0,
|
||||||
|
},
|
||||||
|
.func = (work_func_t)0x0,
|
||||||
|
},
|
||||||
|
.rcu = (struct callback_head){
|
||||||
|
.next = (struct callback_head *)0x0,
|
||||||
|
.func = (void (*)(struct callback_head *))0x0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
.. Links
|
||||||
|
.. _drgn/doc: https://drgn.readthedocs.io/en/latest/
|
||||||
|
.. _drgn/tools: https://github.com/osandov/drgn/tree/master/tools
|
||||||
|
.. _bpf_inspect.py:
|
||||||
|
https://github.com/osandov/drgn/blob/master/tools/bpf_inspect.py
|
|
@ -45,14 +45,16 @@ Program types
|
||||||
prog_cgroup_sockopt
|
prog_cgroup_sockopt
|
||||||
prog_cgroup_sysctl
|
prog_cgroup_sysctl
|
||||||
prog_flow_dissector
|
prog_flow_dissector
|
||||||
|
bpf_lsm
|
||||||
|
|
||||||
|
|
||||||
Testing BPF
|
Testing and debugging BPF
|
||||||
===========
|
=========================
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
|
drgn
|
||||||
s390
|
s390
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,11 @@ needs_sphinx = '1.3'
|
||||||
# ones.
|
# ones.
|
||||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
|
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
|
||||||
'kfigure', 'sphinx.ext.ifconfig', 'automarkup',
|
'kfigure', 'sphinx.ext.ifconfig', 'automarkup',
|
||||||
'maintainers_include']
|
'maintainers_include', 'sphinx.ext.autosectionlabel' ]
|
||||||
|
|
||||||
|
# Ensure that autosectionlabel will produce unique names
|
||||||
|
autosectionlabel_prefix_document = True
|
||||||
|
autosectionlabel_maxdepth = 2
|
||||||
|
|
||||||
# The name of the math extension changed on Sphinx 1.4
|
# The name of the math extension changed on Sphinx 1.4
|
||||||
if (major == 1 and minor > 3) or (major > 1):
|
if (major == 1 and minor > 3) or (major > 1):
|
||||||
|
|
|
@ -8,41 +8,81 @@ This is the beginning of a manual for core kernel APIs. The conversion
|
||||||
Core utilities
|
Core utilities
|
||||||
==============
|
==============
|
||||||
|
|
||||||
|
This section has general and "core core" documentation. The first is a
|
||||||
|
massive grab-bag of kerneldoc info left over from the docbook days; it
|
||||||
|
should really be broken up someday when somebody finds the energy to do
|
||||||
|
it.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
kernel-api
|
kernel-api
|
||||||
assoc_array
|
|
||||||
atomic_ops
|
|
||||||
cachetlb
|
|
||||||
refcount-vs-atomic
|
|
||||||
cpu_hotplug
|
|
||||||
idr
|
|
||||||
local_ops
|
|
||||||
workqueue
|
workqueue
|
||||||
genericirq
|
|
||||||
xarray
|
|
||||||
librs
|
|
||||||
genalloc
|
|
||||||
errseq
|
|
||||||
packing
|
|
||||||
printk-formats
|
printk-formats
|
||||||
|
symbol-namespaces
|
||||||
|
|
||||||
|
Data structures and low-level utilities
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
Library functionality that is used throughout the kernel.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
kobject
|
||||||
|
assoc_array
|
||||||
|
xarray
|
||||||
|
idr
|
||||||
circular-buffers
|
circular-buffers
|
||||||
generic-radix-tree
|
generic-radix-tree
|
||||||
|
packing
|
||||||
|
timekeeping
|
||||||
|
errseq
|
||||||
|
|
||||||
|
Concurrency primitives
|
||||||
|
======================
|
||||||
|
|
||||||
|
How Linux keeps everything from happening at the same time. See
|
||||||
|
:doc:`/locking/index` for more related documentation.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
atomic_ops
|
||||||
|
refcount-vs-atomic
|
||||||
|
local_ops
|
||||||
|
padata
|
||||||
|
../RCU/index
|
||||||
|
|
||||||
|
Low-level hardware management
|
||||||
|
=============================
|
||||||
|
|
||||||
|
Cache management, managing CPU hotplug, etc.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
cachetlb
|
||||||
|
cpu_hotplug
|
||||||
|
memory-hotplug
|
||||||
|
genericirq
|
||||||
|
protection-keys
|
||||||
|
|
||||||
|
Memory management
|
||||||
|
=================
|
||||||
|
|
||||||
|
How to allocate and use memory in the kernel. Note that there is a lot
|
||||||
|
more memory-management documentation in :doc:`/vm/index`.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
memory-allocation
|
memory-allocation
|
||||||
mm-api
|
mm-api
|
||||||
|
genalloc
|
||||||
pin_user_pages
|
pin_user_pages
|
||||||
gfp_mask-from-fs-io
|
|
||||||
timekeeping
|
|
||||||
boot-time-mm
|
boot-time-mm
|
||||||
memory-hotplug
|
gfp_mask-from-fs-io
|
||||||
protection-keys
|
|
||||||
../RCU/index
|
|
||||||
gcc-plugins
|
|
||||||
symbol-namespaces
|
|
||||||
padata
|
|
||||||
ioctl
|
|
||||||
|
|
||||||
|
|
||||||
Interfaces for kernel debugging
|
Interfaces for kernel debugging
|
||||||
===============================
|
===============================
|
||||||
|
@ -53,6 +93,16 @@ Interfaces for kernel debugging
|
||||||
debug-objects
|
debug-objects
|
||||||
tracepoint
|
tracepoint
|
||||||
|
|
||||||
|
Everything else
|
||||||
|
===============
|
||||||
|
|
||||||
|
Documents that don't fit elsewhere or which have yet to be categorized.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
librs
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
Indices
|
Indices
|
||||||
|
|
|
@ -25,7 +25,7 @@ some terms we will be working with.
|
||||||
usually embedded within some other structure which contains the stuff
|
usually embedded within some other structure which contains the stuff
|
||||||
the code is really interested in.
|
the code is really interested in.
|
||||||
|
|
||||||
No structure should EVER have more than one kobject embedded within it.
|
No structure should **EVER** have more than one kobject embedded within it.
|
||||||
If it does, the reference counting for the object is sure to be messed
|
If it does, the reference counting for the object is sure to be messed
|
||||||
up and incorrect, and your code will be buggy. So do not do this.
|
up and incorrect, and your code will be buggy. So do not do this.
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ a larger, domain-specific object. To this end, kobjects will be found
|
||||||
embedded in other structures. If you are used to thinking of things in
|
embedded in other structures. If you are used to thinking of things in
|
||||||
object-oriented terms, kobjects can be seen as a top-level, abstract class
|
object-oriented terms, kobjects can be seen as a top-level, abstract class
|
||||||
from which other classes are derived. A kobject implements a set of
|
from which other classes are derived. A kobject implements a set of
|
||||||
capabilities which are not particularly useful by themselves, but which are
|
capabilities which are not particularly useful by themselves, but are
|
||||||
nice to have in other objects. The C language does not allow for the
|
nice to have in other objects. The C language does not allow for the
|
||||||
direct expression of inheritance, so other techniques - such as structure
|
direct expression of inheritance, so other techniques - such as structure
|
||||||
embedding - must be used.
|
embedding - must be used.
|
||||||
|
@ -65,12 +65,12 @@ this is analogous as to how "list_head" structs are rarely useful on
|
||||||
their own, but are invariably found embedded in the larger objects of
|
their own, but are invariably found embedded in the larger objects of
|
||||||
interest.)
|
interest.)
|
||||||
|
|
||||||
So, for example, the UIO code in drivers/uio/uio.c has a structure that
|
So, for example, the UIO code in ``drivers/uio/uio.c`` has a structure that
|
||||||
defines the memory region associated with a uio device::
|
defines the memory region associated with a uio device::
|
||||||
|
|
||||||
struct uio_map {
|
struct uio_map {
|
||||||
struct kobject kobj;
|
struct kobject kobj;
|
||||||
struct uio_mem *mem;
|
struct uio_mem *mem;
|
||||||
};
|
};
|
||||||
|
|
||||||
If you have a struct uio_map structure, finding its embedded kobject is
|
If you have a struct uio_map structure, finding its embedded kobject is
|
||||||
|
@ -78,30 +78,30 @@ just a matter of using the kobj member. Code that works with kobjects will
|
||||||
often have the opposite problem, however: given a struct kobject pointer,
|
often have the opposite problem, however: given a struct kobject pointer,
|
||||||
what is the pointer to the containing structure? You must avoid tricks
|
what is the pointer to the containing structure? You must avoid tricks
|
||||||
(such as assuming that the kobject is at the beginning of the structure)
|
(such as assuming that the kobject is at the beginning of the structure)
|
||||||
and, instead, use the container_of() macro, found in <linux/kernel.h>::
|
and, instead, use the container_of() macro, found in ``<linux/kernel.h>``::
|
||||||
|
|
||||||
container_of(pointer, type, member)
|
container_of(pointer, type, member)
|
||||||
|
|
||||||
where:
|
where:
|
||||||
|
|
||||||
* "pointer" is the pointer to the embedded kobject,
|
* ``pointer`` is the pointer to the embedded kobject,
|
||||||
* "type" is the type of the containing structure, and
|
* ``type`` is the type of the containing structure, and
|
||||||
* "member" is the name of the structure field to which "pointer" points.
|
* ``member`` is the name of the structure field to which ``pointer`` points.
|
||||||
|
|
||||||
The return value from container_of() is a pointer to the corresponding
|
The return value from container_of() is a pointer to the corresponding
|
||||||
container type. So, for example, a pointer "kp" to a struct kobject
|
container type. So, for example, a pointer ``kp`` to a struct kobject
|
||||||
embedded *within* a struct uio_map could be converted to a pointer to the
|
embedded **within** a struct uio_map could be converted to a pointer to the
|
||||||
*containing* uio_map structure with::
|
**containing** uio_map structure with::
|
||||||
|
|
||||||
struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
|
struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
|
||||||
|
|
||||||
For convenience, programmers often define a simple macro for "back-casting"
|
For convenience, programmers often define a simple macro for **back-casting**
|
||||||
kobject pointers to the containing type. Exactly this happens in the
|
kobject pointers to the containing type. Exactly this happens in the
|
||||||
earlier drivers/uio/uio.c, as you can see here::
|
earlier ``drivers/uio/uio.c``, as you can see here::
|
||||||
|
|
||||||
struct uio_map {
|
struct uio_map {
|
||||||
struct kobject kobj;
|
struct kobject kobj;
|
||||||
struct uio_mem *mem;
|
struct uio_mem *mem;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define to_map(map) container_of(map, struct uio_map, kobj)
|
#define to_map(map) container_of(map, struct uio_map, kobj)
|
||||||
|
@ -125,7 +125,7 @@ must have an associated kobj_type. After calling kobject_init(), to
|
||||||
register the kobject with sysfs, the function kobject_add() must be called::
|
register the kobject with sysfs, the function kobject_add() must be called::
|
||||||
|
|
||||||
int kobject_add(struct kobject *kobj, struct kobject *parent,
|
int kobject_add(struct kobject *kobj, struct kobject *parent,
|
||||||
const char *fmt, ...);
|
const char *fmt, ...);
|
||||||
|
|
||||||
This sets up the parent of the kobject and the name for the kobject
|
This sets up the parent of the kobject and the name for the kobject
|
||||||
properly. If the kobject is to be associated with a specific kset,
|
properly. If the kobject is to be associated with a specific kset,
|
||||||
|
@ -172,13 +172,13 @@ call to kobject_uevent()::
|
||||||
|
|
||||||
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
|
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
|
||||||
|
|
||||||
Use the KOBJ_ADD action for when the kobject is first added to the kernel.
|
Use the **KOBJ_ADD** action for when the kobject is first added to the kernel.
|
||||||
This should be done only after any attributes or children of the kobject
|
This should be done only after any attributes or children of the kobject
|
||||||
have been initialized properly, as userspace will instantly start to look
|
have been initialized properly, as userspace will instantly start to look
|
||||||
for them when this call happens.
|
for them when this call happens.
|
||||||
|
|
||||||
When the kobject is removed from the kernel (details on how to do that are
|
When the kobject is removed from the kernel (details on how to do that are
|
||||||
below), the uevent for KOBJ_REMOVE will be automatically created by the
|
below), the uevent for **KOBJ_REMOVE** will be automatically created by the
|
||||||
kobject core, so the caller does not have to worry about doing that by
|
kobject core, so the caller does not have to worry about doing that by
|
||||||
hand.
|
hand.
|
||||||
|
|
||||||
|
@ -238,7 +238,7 @@ Both types of attributes used here, with a kobject that has been created
|
||||||
with the kobject_create_and_add(), can be of type kobj_attribute, so no
|
with the kobject_create_and_add(), can be of type kobj_attribute, so no
|
||||||
special custom attribute is needed to be created.
|
special custom attribute is needed to be created.
|
||||||
|
|
||||||
See the example module, samples/kobject/kobject-example.c for an
|
See the example module, ``samples/kobject/kobject-example.c`` for an
|
||||||
implementation of a simple kobject and attributes.
|
implementation of a simple kobject and attributes.
|
||||||
|
|
||||||
|
|
||||||
|
@ -270,10 +270,10 @@ such a method has a form like::
|
||||||
|
|
||||||
void my_object_release(struct kobject *kobj)
|
void my_object_release(struct kobject *kobj)
|
||||||
{
|
{
|
||||||
struct my_object *mine = container_of(kobj, struct my_object, kobj);
|
struct my_object *mine = container_of(kobj, struct my_object, kobj);
|
||||||
|
|
||||||
/* Perform any additional cleanup on this object, then... */
|
/* Perform any additional cleanup on this object, then... */
|
||||||
kfree(mine);
|
kfree(mine);
|
||||||
}
|
}
|
||||||
|
|
||||||
One important point cannot be overstated: every kobject must have a
|
One important point cannot be overstated: every kobject must have a
|
||||||
|
@ -297,11 +297,11 @@ instead, it is associated with the ktype. So let us introduce struct
|
||||||
kobj_type::
|
kobj_type::
|
||||||
|
|
||||||
struct kobj_type {
|
struct kobj_type {
|
||||||
void (*release)(struct kobject *kobj);
|
void (*release)(struct kobject *kobj);
|
||||||
const struct sysfs_ops *sysfs_ops;
|
const struct sysfs_ops *sysfs_ops;
|
||||||
struct attribute **default_attrs;
|
struct attribute **default_attrs;
|
||||||
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
|
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
|
||||||
const void *(*namespace)(struct kobject *kobj);
|
const void *(*namespace)(struct kobject *kobj);
|
||||||
};
|
};
|
||||||
|
|
||||||
This structure is used to describe a particular type of kobject (or, more
|
This structure is used to describe a particular type of kobject (or, more
|
||||||
|
@ -352,8 +352,8 @@ created and never declared statically or on the stack. To create a new
|
||||||
kset use::
|
kset use::
|
||||||
|
|
||||||
struct kset *kset_create_and_add(const char *name,
|
struct kset *kset_create_and_add(const char *name,
|
||||||
struct kset_uevent_ops *u,
|
struct kset_uevent_ops *u,
|
||||||
struct kobject *parent);
|
struct kobject *parent);
|
||||||
|
|
||||||
When you are finished with the kset, call::
|
When you are finished with the kset, call::
|
||||||
|
|
||||||
|
@ -365,16 +365,16 @@ Because other references to the kset may still exist, the release may happen
|
||||||
after kset_unregister() returns.
|
after kset_unregister() returns.
|
||||||
|
|
||||||
An example of using a kset can be seen in the
|
An example of using a kset can be seen in the
|
||||||
samples/kobject/kset-example.c file in the kernel tree.
|
``samples/kobject/kset-example.c`` file in the kernel tree.
|
||||||
|
|
||||||
If a kset wishes to control the uevent operations of the kobjects
|
If a kset wishes to control the uevent operations of the kobjects
|
||||||
associated with it, it can use the struct kset_uevent_ops to handle it::
|
associated with it, it can use the struct kset_uevent_ops to handle it::
|
||||||
|
|
||||||
struct kset_uevent_ops {
|
struct kset_uevent_ops {
|
||||||
int (*filter)(struct kset *kset, struct kobject *kobj);
|
int (*filter)(struct kset *kset, struct kobject *kobj);
|
||||||
const char *(*name)(struct kset *kset, struct kobject *kobj);
|
const char *(*name)(struct kset *kset, struct kobject *kobj);
|
||||||
int (*uevent)(struct kset *kset, struct kobject *kobj,
|
int (*uevent)(struct kset *kset, struct kobject *kobj,
|
||||||
struct kobj_uevent_env *env);
|
struct kobj_uevent_env *env);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -408,8 +408,8 @@ Kobject removal
|
||||||
After a kobject has been registered with the kobject core successfully, it
|
After a kobject has been registered with the kobject core successfully, it
|
||||||
must be cleaned up when the code is finished with it. To do that, call
|
must be cleaned up when the code is finished with it. To do that, call
|
||||||
kobject_put(). By doing this, the kobject core will automatically clean up
|
kobject_put(). By doing this, the kobject core will automatically clean up
|
||||||
all of the memory allocated by this kobject. If a KOBJ_ADD uevent has been
|
all of the memory allocated by this kobject. If a ``KOBJ_ADD`` uevent has been
|
||||||
sent for the object, a corresponding KOBJ_REMOVE uevent will be sent, and
|
sent for the object, a corresponding ``KOBJ_REMOVE`` uevent will be sent, and
|
||||||
any other sysfs housekeeping will be handled for the caller properly.
|
any other sysfs housekeeping will be handled for the caller properly.
|
||||||
|
|
||||||
If you need to do a two-stage delete of the kobject (say you are not
|
If you need to do a two-stage delete of the kobject (say you are not
|
||||||
|
@ -430,5 +430,5 @@ Example code to copy from
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
For a more complete example of using ksets and kobjects properly, see the
|
For a more complete example of using ksets and kobjects properly, see the
|
||||||
example programs samples/kobject/{kobject-example.c,kset-example.c},
|
example programs ``samples/kobject/{kobject-example.c,kset-example.c}``,
|
||||||
which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT.
|
which will be built as loadable modules if you select ``CONFIG_SAMPLE_KOBJECT``.
|
|
@ -73,6 +73,9 @@ File Mapping and Page Cache
|
||||||
.. kernel-doc:: mm/truncate.c
|
.. kernel-doc:: mm/truncate.c
|
||||||
:export:
|
:export:
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/pagemap.h
|
||||||
|
:internal:
|
||||||
|
|
||||||
Memory pools
|
Memory pools
|
||||||
============
|
============
|
||||||
|
|
||||||
|
|
|
@ -52,8 +52,22 @@ Which flags are set by each wrapper
|
||||||
|
|
||||||
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
|
For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
|
||||||
flags the caller provides. The caller is required to pass in a non-null struct
|
flags the caller provides. The caller is required to pass in a non-null struct
|
||||||
pages* array, and the function then pin pages by incrementing each by a special
|
pages* array, and the function then pins pages by incrementing each by a special
|
||||||
value. For now, that value is +1, just like get_user_pages*().::
|
value: GUP_PIN_COUNTING_BIAS.
|
||||||
|
|
||||||
|
For huge pages (and in fact, any compound page of more than 2 pages), the
|
||||||
|
GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting
|
||||||
|
is achieved, by using the 3rd struct page in the compound page. A new struct
|
||||||
|
page field, hpage_pinned_refcount, has been added in order to support this.
|
||||||
|
|
||||||
|
This approach for compound pages avoids the counting upper limit problems that
|
||||||
|
are discussed below. Those limitations would have been aggravated severely by
|
||||||
|
huge pages, because each tail page adds a refcount to the head page. And in
|
||||||
|
fact, testing revealed that, without a separate hpage_pinned_refcount field,
|
||||||
|
page overflows were seen in some huge page stress tests.
|
||||||
|
|
||||||
|
This also means that huge pages and compound pages (of order > 1) do not suffer
|
||||||
|
from the false positives problem that is mentioned below.::
|
||||||
|
|
||||||
Function
|
Function
|
||||||
--------
|
--------
|
||||||
|
@ -99,27 +113,6 @@ pages:
|
||||||
This also leads to limitations: there are only 31-10==21 bits available for a
|
This also leads to limitations: there are only 31-10==21 bits available for a
|
||||||
counter that increments 10 bits at a time.
|
counter that increments 10 bits at a time.
|
||||||
|
|
||||||
TODO: for 1GB and larger huge pages, this is cutting it close. That's because
|
|
||||||
when pin_user_pages() follows such pages, it increments the head page by "1"
|
|
||||||
(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
|
|
||||||
pin_user_pages()) for each tail page. So if you have a 1GB huge page:
|
|
||||||
|
|
||||||
* There are 256K (18 bits) worth of 4 KB tail pages.
|
|
||||||
* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
|
|
||||||
10 bits at a time)
|
|
||||||
* There are 21 - 18 == 3 bits available to count. Except that there aren't,
|
|
||||||
because you need to allow for a few normal get_page() calls on the head page,
|
|
||||||
as well. Fortunately, the approach of using addition, rather than "hard"
|
|
||||||
bitfields, within page->_refcount, allows for sharing these bits gracefully.
|
|
||||||
But we're still looking at about 8 references.
|
|
||||||
|
|
||||||
This, however, is a missing feature more than anything else, because it's easily
|
|
||||||
solved by addressing an obvious inefficiency in the original get_user_pages()
|
|
||||||
approach of retrieving pages: stop treating all the pages as if they were
|
|
||||||
PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
|
|
||||||
this, so some work is required. Once that's in place, this limitation mostly
|
|
||||||
disappears from view, because there will be ample refcounting range available.
|
|
||||||
|
|
||||||
* Callers must specifically request "dma-pinned tracking of pages". In other
|
* Callers must specifically request "dma-pinned tracking of pages". In other
|
||||||
words, just calling get_user_pages() will not suffice; a new set of functions,
|
words, just calling get_user_pages() will not suffice; a new set of functions,
|
||||||
pin_user_page() and related, must be used.
|
pin_user_page() and related, must be used.
|
||||||
|
@ -173,8 +166,8 @@ CASE 4: Pinning for struct page manipulation only
|
||||||
-------------------------------------------------
|
-------------------------------------------------
|
||||||
Here, normal GUP calls are sufficient, so neither flag needs to be set.
|
Here, normal GUP calls are sufficient, so neither flag needs to be set.
|
||||||
|
|
||||||
page_dma_pinned(): the whole point of pinning
|
page_maybe_dma_pinned(): the whole point of pinning
|
||||||
=============================================
|
===================================================
|
||||||
|
|
||||||
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
|
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
|
||||||
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
|
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
|
||||||
|
@ -186,7 +179,7 @@ and debates (see the References at the end of this document). It's a TODO item
|
||||||
here: fill in the details once that's worked out. Meanwhile, it's safe to say
|
here: fill in the details once that's worked out. Meanwhile, it's safe to say
|
||||||
that having this available: ::
|
that having this available: ::
|
||||||
|
|
||||||
static inline bool page_dma_pinned(struct page *page)
|
static inline bool page_maybe_dma_pinned(struct page *page)
|
||||||
|
|
||||||
...is a prerequisite to solving the long-running gup+DMA problem.
|
...is a prerequisite to solving the long-running gup+DMA problem.
|
||||||
|
|
||||||
|
@ -215,12 +208,42 @@ has the following new calls to exercise the new pin*() wrapper functions:
|
||||||
You can monitor how many total dma-pinned pages have been acquired and released
|
You can monitor how many total dma-pinned pages have been acquired and released
|
||||||
since the system was booted, via two new /proc/vmstat entries: ::
|
since the system was booted, via two new /proc/vmstat entries: ::
|
||||||
|
|
||||||
/proc/vmstat/nr_foll_pin_requested
|
/proc/vmstat/nr_foll_pin_acquired
|
||||||
/proc/vmstat/nr_foll_pin_requested
|
/proc/vmstat/nr_foll_pin_released
|
||||||
|
|
||||||
Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
|
Under normal conditions, these two values will be equal unless there are any
|
||||||
because there is a noticeable performance drop in unpin_user_page(), when they
|
long-term [R]DMA pins in place, or during pin/unpin transitions.
|
||||||
are activated.
|
|
||||||
|
* nr_foll_pin_acquired: This is the number of logical pins that have been
|
||||||
|
acquired since the system was powered on. For huge pages, the head page is
|
||||||
|
pinned once for each page (head page and each tail page) within the huge page.
|
||||||
|
This follows the same sort of behavior that get_user_pages() uses for huge
|
||||||
|
pages: the head page is refcounted once for each tail or head page in the huge
|
||||||
|
page, when get_user_pages() is applied to a huge page.
|
||||||
|
|
||||||
|
* nr_foll_pin_released: The number of logical pins that have been released since
|
||||||
|
the system was powered on. Note that pages are released (unpinned) on a
|
||||||
|
PAGE_SIZE granularity, even if the original pin was applied to a huge page.
|
||||||
|
Because of the pin count behavior described above in "nr_foll_pin_acquired",
|
||||||
|
the accounting balances out, so that after doing this::
|
||||||
|
|
||||||
|
pin_user_pages(huge_page);
|
||||||
|
for (each page in huge_page)
|
||||||
|
unpin_user_page(page);
|
||||||
|
|
||||||
|
...the following is expected::
|
||||||
|
|
||||||
|
nr_foll_pin_released == nr_foll_pin_acquired
|
||||||
|
|
||||||
|
(...unless it was already out of balance due to a long-term RDMA pin being in
|
||||||
|
place.)
|
||||||
|
|
||||||
|
Other diagnostics
|
||||||
|
=================
|
||||||
|
|
||||||
|
dump_page() has been enhanced slightly, to handle these new counting fields, and
|
||||||
|
to better report on compound pages in general. Specifically, for compound pages
|
||||||
|
with order > 1, the exact (hpage_pinned_refcount) pincount is reported.
|
||||||
|
|
||||||
References
|
References
|
||||||
==========
|
==========
|
||||||
|
@ -228,5 +251,6 @@ References
|
||||||
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
|
* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
|
||||||
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
|
* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
|
||||||
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
|
* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
|
||||||
|
* `LWN kernel index: get_user_pages() <https://lwn.net/Kernel/Index/#Memory_management-get_user_pages>`_
|
||||||
|
|
||||||
John Hubbard, October, 2019
|
John Hubbard, October, 2019
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
|
|
||||||
PowerNow! and Cool'n'Quiet are AMD names for frequency
|
|
||||||
management capabilities in AMD processors. As the hardware
|
|
||||||
implementation changes in new generations of the processors,
|
|
||||||
there is a different cpu-freq driver for each generation.
|
|
||||||
|
|
||||||
Note that the drivers will not load on the "wrong" hardware,
|
|
||||||
so it is safe to try each driver in turn when in doubt as to
|
|
||||||
which is the correct driver.
|
|
||||||
|
|
||||||
Note that the functionality to change frequency (and voltage)
|
|
||||||
is not available in all processors. The drivers will refuse
|
|
||||||
to load on processors without this capability. The capability
|
|
||||||
is detected with the cpuid instruction.
|
|
||||||
|
|
||||||
The drivers use BIOS supplied tables to obtain frequency and
|
|
||||||
voltage information appropriate for a particular platform.
|
|
||||||
Frequency transitions will be unavailable if the BIOS does
|
|
||||||
not supply these tables.
|
|
||||||
|
|
||||||
6th Generation: powernow-k6
|
|
||||||
|
|
||||||
7th Generation: powernow-k7: Athlon, Duron, Geode.
|
|
||||||
|
|
||||||
8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron.
|
|
||||||
Documentation on this functionality in 8th generation processors
|
|
||||||
is available in the "BIOS and Kernel Developer's Guide", publication
|
|
||||||
26094, in chapter 9, available for download from www.amd.com.
|
|
||||||
|
|
||||||
BIOS supplied data, for powernow-k7 and for powernow-k8, may be
|
|
||||||
from either the PSB table or from ACPI objects. The ACPI support
|
|
||||||
is only available if the kernel config sets CONFIG_ACPI_PROCESSOR.
|
|
||||||
The powernow-k8 driver will attempt to use ACPI if so configured,
|
|
||||||
and fall back to PST if that fails.
|
|
||||||
The powernow-k7 driver will try to use the PSB support first, and
|
|
||||||
fall back to ACPI if the PSB support fails. A module parameter,
|
|
||||||
acpi_force, is provided to force ACPI support to be used instead
|
|
||||||
of PSB support.
|
|
|
@ -1,31 +1,23 @@
|
||||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============================================================
|
||||||
|
General description of the CPUFreq core and CPUFreq notifiers
|
||||||
|
=============================================================
|
||||||
|
|
||||||
L i n u x C P U F r e q
|
Authors:
|
||||||
|
- Dominik Brodowski <linux@brodo.de>
|
||||||
|
- David Kimdon <dwhedon@debian.org>
|
||||||
|
- Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
- Viresh Kumar <viresh.kumar@linaro.org>
|
||||||
|
|
||||||
C P U F r e q C o r e
|
.. Contents:
|
||||||
|
|
||||||
|
1. CPUFreq core and interfaces
|
||||||
Dominik Brodowski <linux@brodo.de>
|
2. CPUFreq notifiers
|
||||||
David Kimdon <dwhedon@debian.org>
|
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
||||||
Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
Viresh Kumar <viresh.kumar@linaro.org>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
|
||||||
fly. This is a nice method to save battery power, because the lower
|
|
||||||
the clock speed, the less power the CPU consumes.
|
|
||||||
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
---------
|
|
||||||
1. CPUFreq core and interfaces
|
|
||||||
2. CPUFreq notifiers
|
|
||||||
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
|
||||||
|
|
||||||
1. General Information
|
1. General Information
|
||||||
=======================
|
======================
|
||||||
|
|
||||||
The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This
|
The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This
|
||||||
cpufreq code offers a standardized interface for the CPUFreq
|
cpufreq code offers a standardized interface for the CPUFreq
|
||||||
|
@ -63,7 +55,7 @@ The phase is specified in the second argument to the notifier. The phase is
|
||||||
CPUFREQ_CREATE_POLICY when the policy is first created and it is
|
CPUFREQ_CREATE_POLICY when the policy is first created and it is
|
||||||
CPUFREQ_REMOVE_POLICY when the policy is removed.
|
CPUFREQ_REMOVE_POLICY when the policy is removed.
|
||||||
|
|
||||||
The third argument, a void *pointer, points to a struct cpufreq_policy
|
The third argument, a ``void *pointer``, points to a struct cpufreq_policy
|
||||||
consisting of several values, including min, max (the lower and upper
|
consisting of several values, including min, max (the lower and upper
|
||||||
frequencies (in kHz) of the new policy).
|
frequencies (in kHz) of the new policy).
|
||||||
|
|
||||||
|
@ -80,10 +72,13 @@ CPUFREQ_POSTCHANGE.
|
||||||
|
|
||||||
The third argument is a struct cpufreq_freqs with the following
|
The third argument is a struct cpufreq_freqs with the following
|
||||||
values:
|
values:
|
||||||
cpu - number of the affected CPU
|
|
||||||
old - old frequency
|
===== ===========================
|
||||||
new - new frequency
|
cpu number of the affected CPU
|
||||||
flags - flags of the cpufreq driver
|
old old frequency
|
||||||
|
new new frequency
|
||||||
|
flags flags of the cpufreq driver
|
||||||
|
===== ===========================
|
||||||
|
|
||||||
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
3. CPUFreq Table Generation with Operating Performance Point (OPP)
|
||||||
==================================================================
|
==================================================================
|
||||||
|
@ -94,9 +89,12 @@ dev_pm_opp_init_cpufreq_table -
|
||||||
the OPP layer's internal information about the available frequencies
|
the OPP layer's internal information about the available frequencies
|
||||||
into a format readily providable to cpufreq.
|
into a format readily providable to cpufreq.
|
||||||
|
|
||||||
WARNING: Do not use this function in interrupt context.
|
.. Warning::
|
||||||
|
|
||||||
|
Do not use this function in interrupt context.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
Example:
|
|
||||||
soc_pm_init()
|
soc_pm_init()
|
||||||
{
|
{
|
||||||
/* Do things */
|
/* Do things */
|
||||||
|
@ -106,7 +104,10 @@ dev_pm_opp_init_cpufreq_table -
|
||||||
/* Do other things */
|
/* Do other things */
|
||||||
}
|
}
|
||||||
|
|
||||||
NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in
|
.. note::
|
||||||
addition to CONFIG_PM_OPP.
|
|
||||||
|
|
||||||
dev_pm_opp_free_cpufreq_table - Free up the table allocated by dev_pm_opp_init_cpufreq_table
|
This function is available only if CONFIG_CPU_FREQ is enabled in
|
||||||
|
addition to CONFIG_PM_OPP.
|
||||||
|
|
||||||
|
dev_pm_opp_free_cpufreq_table
|
||||||
|
Free up the table allocated by dev_pm_opp_init_cpufreq_table
|
|
@ -1,35 +1,27 @@
|
||||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===============================================
|
||||||
|
How to Implement a new CPUFreq Processor Driver
|
||||||
|
===============================================
|
||||||
|
|
||||||
|
Authors:
|
||||||
|
|
||||||
|
|
||||||
L i n u x C P U F r e q
|
- Dominik Brodowski <linux@brodo.de>
|
||||||
|
- Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
- Viresh Kumar <viresh.kumar@linaro.org>
|
||||||
|
|
||||||
C P U D r i v e r s
|
.. Contents
|
||||||
|
|
||||||
- information for developers -
|
1. What To Do?
|
||||||
|
1.1 Initialization
|
||||||
|
1.2 Per-CPU Initialization
|
||||||
Dominik Brodowski <linux@brodo.de>
|
1.3 verify
|
||||||
Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
1.4 target/target_index or setpolicy?
|
||||||
Viresh Kumar <viresh.kumar@linaro.org>
|
1.5 target/target_index
|
||||||
|
1.6 setpolicy
|
||||||
|
1.7 get_intermediate and target_intermediate
|
||||||
|
2. Frequency Table Helpers
|
||||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
|
||||||
fly. This is a nice method to save battery power, because the lower
|
|
||||||
the clock speed, the less power the CPU consumes.
|
|
||||||
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
---------
|
|
||||||
1. What To Do?
|
|
||||||
1.1 Initialization
|
|
||||||
1.2 Per-CPU Initialization
|
|
||||||
1.3 verify
|
|
||||||
1.4 target/target_index or setpolicy?
|
|
||||||
1.5 target/target_index
|
|
||||||
1.6 setpolicy
|
|
||||||
1.7 get_intermediate and target_intermediate
|
|
||||||
2. Frequency Table Helpers
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -49,7 +41,7 @@ function check whether this kernel runs on the right CPU and the right
|
||||||
chipset. If so, register a struct cpufreq_driver with the CPUfreq core
|
chipset. If so, register a struct cpufreq_driver with the CPUfreq core
|
||||||
using cpufreq_register_driver()
|
using cpufreq_register_driver()
|
||||||
|
|
||||||
What shall this struct cpufreq_driver contain?
|
What shall this struct cpufreq_driver contain?
|
||||||
|
|
||||||
.name - The name of this driver.
|
.name - The name of this driver.
|
||||||
|
|
||||||
|
@ -108,37 +100,42 @@ Whenever a new CPU is registered with the device model, or after the
|
||||||
cpufreq driver registers itself, the per-policy initialization function
|
cpufreq driver registers itself, the per-policy initialization function
|
||||||
cpufreq_driver.init is called if no cpufreq policy existed for the CPU.
|
cpufreq_driver.init is called if no cpufreq policy existed for the CPU.
|
||||||
Note that the .init() and .exit() routines are called only once for the
|
Note that the .init() and .exit() routines are called only once for the
|
||||||
policy and not for each CPU managed by the policy. It takes a struct
|
policy and not for each CPU managed by the policy. It takes a ``struct
|
||||||
cpufreq_policy *policy as argument. What to do now?
|
cpufreq_policy *policy`` as argument. What to do now?
|
||||||
|
|
||||||
If necessary, activate the CPUfreq support on your CPU.
|
If necessary, activate the CPUfreq support on your CPU.
|
||||||
|
|
||||||
Then, the driver must fill in the following values:
|
Then, the driver must fill in the following values:
|
||||||
|
|
||||||
policy->cpuinfo.min_freq _and_
|
+-----------------------------------+--------------------------------------+
|
||||||
policy->cpuinfo.max_freq - the minimum and maximum frequency
|
|policy->cpuinfo.min_freq _and_ | |
|
||||||
(in kHz) which is supported by
|
|policy->cpuinfo.max_freq | the minimum and maximum frequency |
|
||||||
this CPU
|
| | (in kHz) which is supported by |
|
||||||
policy->cpuinfo.transition_latency the time it takes on this CPU to
|
| | this CPU |
|
||||||
switch between two frequencies in
|
+-----------------------------------+--------------------------------------+
|
||||||
nanoseconds (if appropriate, else
|
|policy->cpuinfo.transition_latency | the time it takes on this CPU to |
|
||||||
specify CPUFREQ_ETERNAL)
|
| | switch between two frequencies in |
|
||||||
|
| | nanoseconds (if appropriate, else |
|
||||||
policy->cur The current operating frequency of
|
| | specify CPUFREQ_ETERNAL) |
|
||||||
this CPU (if appropriate)
|
+-----------------------------------+--------------------------------------+
|
||||||
policy->min,
|
|policy->cur | The current operating frequency of |
|
||||||
policy->max,
|
| | this CPU (if appropriate) |
|
||||||
policy->policy and, if necessary,
|
+-----------------------------------+--------------------------------------+
|
||||||
policy->governor must contain the "default policy" for
|
|policy->min, | |
|
||||||
this CPU. A few moments later,
|
|policy->max, | |
|
||||||
cpufreq_driver.verify and either
|
|policy->policy and, if necessary, | |
|
||||||
cpufreq_driver.setpolicy or
|
|policy->governor | must contain the "default policy" for|
|
||||||
cpufreq_driver.target/target_index is called
|
| | this CPU. A few moments later, |
|
||||||
with these values.
|
| | cpufreq_driver.verify and either |
|
||||||
policy->cpus Update this with the masks of the
|
| | cpufreq_driver.setpolicy or |
|
||||||
(online + offline) CPUs that do DVFS
|
| | cpufreq_driver.target/target_index is|
|
||||||
along with this CPU (i.e. that share
|
| | called with these values. |
|
||||||
clock/voltage rails with it).
|
+-----------------------------------+--------------------------------------+
|
||||||
|
|policy->cpus | Update this with the masks of the |
|
||||||
|
| | (online + offline) CPUs that do DVFS |
|
||||||
|
| | along with this CPU (i.e. that share|
|
||||||
|
| | clock/voltage rails with it). |
|
||||||
|
+-----------------------------------+--------------------------------------+
|
||||||
|
|
||||||
For setting some of these values (cpuinfo.min[max]_freq, policy->min[max]), the
|
For setting some of these values (cpuinfo.min[max]_freq, policy->min[max]), the
|
||||||
frequency table helpers might be helpful. See section 2 for more information
|
frequency table helpers might be helpful. See section 2 for more information
|
||||||
|
@ -151,8 +148,8 @@ on them.
|
||||||
When the user decides a new policy (consisting of
|
When the user decides a new policy (consisting of
|
||||||
"policy,governor,min,max") shall be set, this policy must be validated
|
"policy,governor,min,max") shall be set, this policy must be validated
|
||||||
so that incompatible values can be corrected. For verifying these
|
so that incompatible values can be corrected. For verifying these
|
||||||
values cpufreq_verify_within_limits(struct cpufreq_policy *policy,
|
values cpufreq_verify_within_limits(``struct cpufreq_policy *policy``,
|
||||||
unsigned int min_freq, unsigned int max_freq) function might be helpful.
|
``unsigned int min_freq``, ``unsigned int max_freq``) function might be helpful.
|
||||||
See section 2 for details on frequency table helpers.
|
See section 2 for details on frequency table helpers.
|
||||||
|
|
||||||
You need to make sure that at least one valid frequency (or operating
|
You need to make sure that at least one valid frequency (or operating
|
||||||
|
@ -163,7 +160,7 @@ policy->max first, and only if this is no solution, decrease policy->min.
|
||||||
1.4 target or target_index or setpolicy or fast_switch?
|
1.4 target or target_index or setpolicy or fast_switch?
|
||||||
-------------------------------------------------------
|
-------------------------------------------------------
|
||||||
|
|
||||||
Most cpufreq drivers or even most cpu frequency scaling algorithms
|
Most cpufreq drivers or even most cpu frequency scaling algorithms
|
||||||
only allow the CPU frequency to be set to predefined fixed values. For
|
only allow the CPU frequency to be set to predefined fixed values. For
|
||||||
these, you use the ->target(), ->target_index() or ->fast_switch()
|
these, you use the ->target(), ->target_index() or ->fast_switch()
|
||||||
callbacks.
|
callbacks.
|
||||||
|
@ -175,8 +172,8 @@ limits on their own. These shall use the ->setpolicy() callback.
|
||||||
1.5. target/target_index
|
1.5. target/target_index
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
The target_index call has two arguments: struct cpufreq_policy *policy,
|
The target_index call has two arguments: ``struct cpufreq_policy *policy``,
|
||||||
and unsigned int index (into the exposed frequency table).
|
and ``unsigned int`` index (into the exposed frequency table).
|
||||||
|
|
||||||
The CPUfreq driver must set the new frequency when called here. The
|
The CPUfreq driver must set the new frequency when called here. The
|
||||||
actual frequency must be determined by freq_table[index].frequency.
|
actual frequency must be determined by freq_table[index].frequency.
|
||||||
|
@ -184,9 +181,9 @@ actual frequency must be determined by freq_table[index].frequency.
|
||||||
It should always restore to earlier frequency (i.e. policy->restore_freq) in
|
It should always restore to earlier frequency (i.e. policy->restore_freq) in
|
||||||
case of errors, even if we switched to intermediate frequency earlier.
|
case of errors, even if we switched to intermediate frequency earlier.
|
||||||
|
|
||||||
Deprecated:
|
Deprecated
|
||||||
----------
|
----------
|
||||||
The target call has three arguments: struct cpufreq_policy *policy,
|
The target call has three arguments: ``struct cpufreq_policy *policy``,
|
||||||
unsigned int target_frequency, unsigned int relation.
|
unsigned int target_frequency, unsigned int relation.
|
||||||
|
|
||||||
The CPUfreq driver must set the new frequency when called here. The
|
The CPUfreq driver must set the new frequency when called here. The
|
||||||
|
@ -210,14 +207,14 @@ Not all drivers are expected to implement it, as sleeping from within
|
||||||
this callback isn't allowed. This callback must be highly optimized to
|
this callback isn't allowed. This callback must be highly optimized to
|
||||||
do switching as fast as possible.
|
do switching as fast as possible.
|
||||||
|
|
||||||
This function has two arguments: struct cpufreq_policy *policy and
|
This function has two arguments: ``struct cpufreq_policy *policy`` and
|
||||||
unsigned int target_frequency.
|
``unsigned int target_frequency``.
|
||||||
|
|
||||||
|
|
||||||
1.7 setpolicy
|
1.7 setpolicy
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
The setpolicy call only takes a struct cpufreq_policy *policy as
|
The setpolicy call only takes a ``struct cpufreq_policy *policy`` as
|
||||||
argument. You need to set the lower limit of the in-processor or
|
argument. You need to set the lower limit of the in-processor or
|
||||||
in-chipset dynamic frequency switching to policy->min, the upper limit
|
in-chipset dynamic frequency switching to policy->min, the upper limit
|
||||||
to policy->max, and -if supported- select a performance-oriented
|
to policy->max, and -if supported- select a performance-oriented
|
||||||
|
@ -278,10 +275,10 @@ table.
|
||||||
|
|
||||||
cpufreq_for_each_valid_entry(pos, table) - iterates over all entries,
|
cpufreq_for_each_valid_entry(pos, table) - iterates over all entries,
|
||||||
excluding CPUFREQ_ENTRY_INVALID frequencies.
|
excluding CPUFREQ_ENTRY_INVALID frequencies.
|
||||||
Use arguments "pos" - a cpufreq_frequency_table * as a loop cursor and
|
Use arguments "pos" - a ``cpufreq_frequency_table *`` as a loop cursor and
|
||||||
"table" - the cpufreq_frequency_table * you want to iterate over.
|
"table" - the ``cpufreq_frequency_table *`` you want to iterate over.
|
||||||
|
|
||||||
For example:
|
For example::
|
||||||
|
|
||||||
struct cpufreq_frequency_table *pos, *driver_freq_table;
|
struct cpufreq_frequency_table *pos, *driver_freq_table;
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
|
|
||||||
The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms.
|
|
||||||
|
|
||||||
This works better than on other platforms, because the FSB of the CPU
|
|
||||||
can be controlled independently from the PCI/AGP clock.
|
|
||||||
|
|
||||||
The module has two options:
|
|
||||||
|
|
||||||
fid: multiplier * 10 (for example 8.5 = 85)
|
|
||||||
min_fsb: minimum FSB
|
|
||||||
|
|
||||||
If not set, fid is calculated from the current CPU speed and the FSB.
|
|
||||||
min_fsb defaults to FSB at boot time - 50 MHz.
|
|
||||||
|
|
||||||
IMPORTANT: The available range is limited downwards!
|
|
||||||
Also the minimum available FSB can differ, for systems
|
|
||||||
booting with 200 MHz, 150 should always work.
|
|
||||||
|
|
||||||
|
|
|
@ -1,21 +1,23 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
CPU frequency and voltage scaling statistics in the Linux(TM) kernel
|
==========================================
|
||||||
|
General Description of sysfs CPUFreq Stats
|
||||||
|
==========================================
|
||||||
|
|
||||||
|
information for users
|
||||||
|
|
||||||
|
|
||||||
L i n u x c p u f r e q - s t a t s d r i v e r
|
Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
|
||||||
|
|
||||||
- information for users -
|
.. Contents
|
||||||
|
|
||||||
|
1. Introduction
|
||||||
Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
|
2. Statistics Provided (with example)
|
||||||
|
3. Configuring cpufreq-stats
|
||||||
Contents
|
|
||||||
1. Introduction
|
|
||||||
2. Statistics Provided (with example)
|
|
||||||
3. Configuring cpufreq-stats
|
|
||||||
|
|
||||||
|
|
||||||
1. Introduction
|
1. Introduction
|
||||||
|
===============
|
||||||
|
|
||||||
cpufreq-stats is a driver that provides CPU frequency statistics for each CPU.
|
cpufreq-stats is a driver that provides CPU frequency statistics for each CPU.
|
||||||
These statistics are provided in /sysfs as a bunch of read_only interfaces. This
|
These statistics are provided in /sysfs as a set of read-only interfaces. This
|
||||||
|
@ -28,8 +30,10 @@ that may be running on your CPU. So, it will work with any cpufreq_driver.
|
||||||
|
|
||||||
|
|
||||||
2. Statistics Provided (with example)
|
2. Statistics Provided (with example)
|
||||||
|
=====================================
|
||||||
|
|
||||||
cpufreq stats provides following statistics (explained in detail below).
|
cpufreq-stats provides the following statistics (explained in detail below).
|
||||||
|
|
||||||
- time_in_state
|
- time_in_state
|
||||||
- total_trans
|
- total_trans
|
||||||
- trans_table
|
- trans_table
|
||||||
|
@ -39,53 +43,57 @@ All the statistics will be from the time the stats driver has been inserted
|
||||||
statistic is done. Obviously, stats driver will not have any information
|
statistic is done. Obviously, stats driver will not have any information
|
||||||
about the frequency transitions before the stats driver insertion.
|
about the frequency transitions before the stats driver insertion.
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
::
|
||||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l
|
|
||||||
total 0
|
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l
|
||||||
drwxr-xr-x 2 root root 0 May 14 16:06 .
|
total 0
|
||||||
drwxr-xr-x 3 root root 0 May 14 15:58 ..
|
drwxr-xr-x 2 root root 0 May 14 16:06 .
|
||||||
--w------- 1 root root 4096 May 14 16:06 reset
|
drwxr-xr-x 3 root root 0 May 14 15:58 ..
|
||||||
-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
|
--w------- 1 root root 4096 May 14 16:06 reset
|
||||||
-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
|
-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
|
||||||
-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
|
-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
|
||||||
--------------------------------------------------------------------------------
|
-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
|
||||||
|
|
||||||
|
- **reset**
|
||||||
|
|
||||||
- reset
|
|
||||||
Write-only attribute that can be used to reset the stat counters. This can be
|
Write-only attribute that can be used to reset the stat counters. This can be
|
||||||
useful for evaluating system behaviour under different governors without the
|
useful for evaluating system behaviour under different governors without the
|
||||||
need for a reboot.
|
need for a reboot.
|
||||||
|
|
||||||
- time_in_state
|
- **time_in_state**
|
||||||
|
|
||||||
This gives the amount of time spent in each of the frequencies supported by
|
This gives the amount of time spent in each of the frequencies supported by
|
||||||
this CPU. The cat output will have "<frequency> <time>" pair in each line, which
|
this CPU. The cat output will have "<frequency> <time>" pair in each line, which
|
||||||
will mean this CPU spent <time> usertime units of time at <frequency>. Output
|
will mean this CPU spent <time> usertime units of time at <frequency>. Output
|
||||||
will have one line for each of the supported frequencies. usertime units here
|
will have one line for each of the supported frequencies. usertime units here
|
||||||
is 10mS (similar to other time exported in /proc).
|
is 10 ms (similar to other times exported in /proc).
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
::
|
||||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat time_in_state
|
|
||||||
3600000 2089
|
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat time_in_state
|
||||||
3400000 136
|
3600000 2089
|
||||||
3200000 34
|
3400000 136
|
||||||
3000000 67
|
3200000 34
|
||||||
2800000 172488
|
3000000 67
|
||||||
--------------------------------------------------------------------------------
|
2800000 172488
|
||||||
|
|
||||||
|
|
||||||
- total_trans
|
- **total_trans**
|
||||||
This gives the total number of frequency transitions on this CPU. The cat
|
|
||||||
|
This gives the total number of frequency transitions on this CPU. The cat
|
||||||
output will have a single count which is the total number of frequency
|
output will have a single count which is the total number of frequency
|
||||||
transitions.
|
transitions.
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
::
|
||||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat total_trans
|
|
||||||
20
|
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat total_trans
|
||||||
--------------------------------------------------------------------------------
|
20
|
||||||
|
|
||||||
|
- **trans_table**
|
||||||
|
|
||||||
- trans_table
|
|
||||||
This will give a fine grained information about all the CPU frequency
|
This gives fine-grained information about all the CPU frequency
|
||||||
transitions. The cat output here is a two dimensional matrix, where an entry
|
transitions. The cat output here is a two dimensional matrix, where an entry
|
||||||
<i,j> (row i, column j) represents the count of number of transitions from
|
<i,j> (row i, column j) represents the number of transitions from
|
||||||
Freq_i to Freq_j. Freq_i rows and Freq_j columns follow the sorting order in
|
Freq_i to Freq_j. Freq_i rows and Freq_j columns follow the sorting order in
|
||||||
which the driver has provided the frequency table initially to the cpufreq core
|
which the driver has provided the frequency table initially to the cpufreq core
|
||||||
and so can be sorted (ascending or descending) or unsorted. The output here
|
and so can be sorted (ascending or descending) or unsorted. The output here
|
||||||
|
@ -95,26 +103,27 @@ readability.
|
||||||
If the transition table is bigger than PAGE_SIZE, reading this will
|
If the transition table is bigger than PAGE_SIZE, reading this will
|
||||||
return an -EFBIG error.
|
return an -EFBIG error.
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
::
|
||||||
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat trans_table
|
|
||||||
From : To
|
|
||||||
: 3600000 3400000 3200000 3000000 2800000
|
|
||||||
3600000: 0 5 0 0 0
|
|
||||||
3400000: 4 0 2 0 0
|
|
||||||
3200000: 0 1 0 2 0
|
|
||||||
3000000: 0 0 1 0 3
|
|
||||||
2800000: 0 0 0 2 0
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
<mysystem>:/sys/devices/system/cpu/cpu0/cpufreq/stats # cat trans_table
|
||||||
|
From : To
|
||||||
|
: 3600000 3400000 3200000 3000000 2800000
|
||||||
|
3600000: 0 5 0 0 0
|
||||||
|
3400000: 4 0 2 0 0
|
||||||
|
3200000: 0 1 0 2 0
|
||||||
|
3000000: 0 0 1 0 3
|
||||||
|
2800000: 0 0 0 2 0
|
||||||
|
|
||||||
3. Configuring cpufreq-stats
|
3. Configuring cpufreq-stats
|
||||||
|
============================
|
||||||
|
|
||||||
To configure cpufreq-stats in your kernel
|
To configure cpufreq-stats in your kernel::
|
||||||
Config Main Menu
|
|
||||||
Power management options (ACPI, APM) --->
|
Config Main Menu
|
||||||
CPU Frequency scaling --->
|
Power management options (ACPI, APM) --->
|
||||||
[*] CPU Frequency scaling
|
CPU Frequency scaling --->
|
||||||
[*] CPU frequency translation statistics
|
[*] CPU Frequency scaling
|
||||||
|
[*] CPU frequency translation statistics
|
||||||
|
|
||||||
|
|
||||||
"CPU Frequency scaling" (CONFIG_CPU_FREQ) should be enabled to configure
|
"CPU Frequency scaling" (CONFIG_CPU_FREQ) should be enabled to configure
|
39
Documentation/cpu-freq/index.rst
Normal file
39
Documentation/cpu-freq/index.rst
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
==============================================================================
|
||||||
|
Linux CPUFreq - CPU frequency and voltage scaling code in the Linux(TM) kernel
|
||||||
|
==============================================================================
|
||||||
|
|
||||||
|
Author: Dominik Brodowski <linux@brodo.de>
|
||||||
|
|
||||||
|
Clock scaling allows you to change the clock speed of the CPUs on the
|
||||||
|
fly. This is a nice method to save battery power, because the lower
|
||||||
|
the clock speed, the less power the CPU consumes.
|
||||||
|
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
core
|
||||||
|
cpu-drivers
|
||||||
|
cpufreq-stats
|
||||||
|
|
||||||
|
Mailing List
|
||||||
|
------------
|
||||||
|
There is a CPU frequency scaling list (CVS commits and general discussion) where
|
||||||
|
you can report bugs, problems or submit patches. To post a message,
|
||||||
|
send an email to linux-pm@vger.kernel.org.
|
||||||
|
|
||||||
|
Links
|
||||||
|
-----
|
||||||
|
the FTP archives:
|
||||||
|
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
|
||||||
|
|
||||||
|
how to access the CVS repository:
|
||||||
|
* http://cvs.arm.linux.org.uk/
|
||||||
|
|
||||||
|
the CPUFreq Mailing list:
|
||||||
|
* http://vger.kernel.org/vger-lists.html#linux-pm
|
||||||
|
|
||||||
|
Clock and voltage scaling for the SA-1100:
|
||||||
|
* http://www.lartmaker.nl/projects/scaling
|
|
@ -1,56 +0,0 @@
|
||||||
CPU frequency and voltage scaling code in the Linux(TM) kernel
|
|
||||||
|
|
||||||
|
|
||||||
L i n u x C P U F r e q
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Dominik Brodowski <linux@brodo.de>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Clock scaling allows you to change the clock speed of the CPUs on the
|
|
||||||
fly. This is a nice method to save battery power, because the lower
|
|
||||||
the clock speed, the less power the CPU consumes.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Documents in this directory:
|
|
||||||
----------------------------
|
|
||||||
|
|
||||||
amd-powernow.txt - AMD powernow driver specific file.
|
|
||||||
|
|
||||||
core.txt - General description of the CPUFreq core and
|
|
||||||
of CPUFreq notifiers.
|
|
||||||
|
|
||||||
cpu-drivers.txt - How to implement a new cpufreq processor driver.
|
|
||||||
|
|
||||||
cpufreq-nforce2.txt - nVidia nForce2 platform specific file.
|
|
||||||
|
|
||||||
cpufreq-stats.txt - General description of sysfs cpufreq stats.
|
|
||||||
|
|
||||||
index.txt - File index, Mailing list and Links (this document)
|
|
||||||
|
|
||||||
pcc-cpufreq.txt - PCC cpufreq driver specific file.
|
|
||||||
|
|
||||||
|
|
||||||
Mailing List
|
|
||||||
------------
|
|
||||||
There is a CPU frequency changing CVS commit and general list where
|
|
||||||
you can report bugs, problems or submit patches. To post a message,
|
|
||||||
send an email to linux-pm@vger.kernel.org.
|
|
||||||
|
|
||||||
Links
|
|
||||||
-----
|
|
||||||
the FTP archives:
|
|
||||||
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
|
|
||||||
|
|
||||||
how to access the CVS repository:
|
|
||||||
* http://cvs.arm.linux.org.uk/
|
|
||||||
|
|
||||||
the CPUFreq Mailing list:
|
|
||||||
* http://vger.kernel.org/vger-lists.html#linux-pm
|
|
||||||
|
|
||||||
Clock and voltage scaling for the SA-1100:
|
|
||||||
* http://www.lartmaker.nl/projects/scaling
|
|
|
@ -1,207 +0,0 @@
|
||||||
/*
|
|
||||||
* pcc-cpufreq.txt - PCC interface documentation
|
|
||||||
*
|
|
||||||
* Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
|
|
||||||
* Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
|
|
||||||
* Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
|
|
||||||
*
|
|
||||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; version 2 of the License.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful, but
|
|
||||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
|
|
||||||
* INFRINGEMENT. See the GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License along
|
|
||||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
* 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
||||||
*
|
|
||||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
Processor Clocking Control Driver
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
---------
|
|
||||||
1. Introduction
|
|
||||||
1.1 PCC interface
|
|
||||||
1.1.1 Get Average Frequency
|
|
||||||
1.1.2 Set Desired Frequency
|
|
||||||
1.2 Platforms affected
|
|
||||||
2. Driver and /sys details
|
|
||||||
2.1 scaling_available_frequencies
|
|
||||||
2.2 cpuinfo_transition_latency
|
|
||||||
2.3 cpuinfo_cur_freq
|
|
||||||
2.4 related_cpus
|
|
||||||
3. Caveats
|
|
||||||
|
|
||||||
1. Introduction:
|
|
||||||
----------------
|
|
||||||
Processor Clocking Control (PCC) is an interface between the platform
|
|
||||||
firmware and OSPM. It is a mechanism for coordinating processor
|
|
||||||
performance (ie: frequency) between the platform firmware and the OS.
|
|
||||||
|
|
||||||
The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
|
|
||||||
interface.
|
|
||||||
|
|
||||||
OS utilizes the PCC interface to inform platform firmware what frequency the
|
|
||||||
OS wants for a logical processor. The platform firmware attempts to achieve
|
|
||||||
the requested frequency. If the request for the target frequency could not be
|
|
||||||
satisfied by platform firmware, then it usually means that power budget
|
|
||||||
conditions are in place, and "power capping" is taking place.
|
|
||||||
|
|
||||||
1.1 PCC interface:
|
|
||||||
------------------
|
|
||||||
The complete PCC specification is available here:
|
|
||||||
http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf
|
|
||||||
|
|
||||||
PCC relies on a shared memory region that provides a channel for communication
|
|
||||||
between the OS and platform firmware. PCC also implements a "doorbell" that
|
|
||||||
is used by the OS to inform the platform firmware that a command has been
|
|
||||||
sent.
|
|
||||||
|
|
||||||
The ACPI PCCH() method is used to discover the location of the PCC shared
|
|
||||||
memory region. The shared memory region header contains the "command" and
|
|
||||||
"status" interface. PCCH() also contains details on how to access the platform
|
|
||||||
doorbell.
|
|
||||||
|
|
||||||
The following commands are supported by the PCC interface:
|
|
||||||
* Get Average Frequency
|
|
||||||
* Set Desired Frequency
|
|
||||||
|
|
||||||
The ACPI PCCP() method is implemented for each logical processor and is
|
|
||||||
used to discover the offsets for the input and output buffers in the shared
|
|
||||||
memory region.
|
|
||||||
|
|
||||||
When PCC mode is enabled, the platform will not expose processor performance
|
|
||||||
or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
|
|
||||||
the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
|
|
||||||
AMD) will not load.
|
|
||||||
|
|
||||||
However, OSPM remains in control of policy. The governor (eg: "ondemand")
|
|
||||||
computes the required performance for each processor based on server workload.
|
|
||||||
The PCC driver fills in the command interface, and the input buffer and
|
|
||||||
communicates the request to the platform firmware. The platform firmware is
|
|
||||||
responsible for delivering the requested performance.
|
|
||||||
|
|
||||||
Each PCC command is "global" in scope and can affect all the logical CPUs in
|
|
||||||
the system. Therefore, PCC is capable of performing "group" updates. With PCC
|
|
||||||
the OS is capable of getting/setting the frequency of all the logical CPUs in
|
|
||||||
the system with a single call to the BIOS.
|
|
||||||
|
|
||||||
1.1.1 Get Average Frequency:
|
|
||||||
----------------------------
|
|
||||||
This command is used by the OSPM to query the running frequency of the
|
|
||||||
processor since the last time this command was completed. The output buffer
|
|
||||||
indicates the average unhalted frequency of the logical processor expressed as
|
|
||||||
a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
|
|
||||||
also signifies if the CPU frequency is limited by a power budget condition.
|
|
||||||
|
|
||||||
1.1.2 Set Desired Frequency:
|
|
||||||
----------------------------
|
|
||||||
This command is used by the OSPM to communicate to the platform firmware the
|
|
||||||
desired frequency for a logical processor. The output buffer is currently
|
|
||||||
ignored by OSPM. The next invocation of "Get Average Frequency" will inform
|
|
||||||
OSPM if the desired frequency was achieved or not.
|
|
||||||
|
|
||||||
1.2 Platforms affected:
|
|
||||||
-----------------------
|
|
||||||
The PCC driver will load on any system where the platform firmware:
|
|
||||||
* supports the PCC interface, and the associated PCCH() and PCCP() methods
|
|
||||||
* assumes responsibility for managing the hardware clocking controls in order
|
|
||||||
to deliver the requested processor performance
|
|
||||||
|
|
||||||
Currently, certain HP ProLiant platforms implement the PCC interface. On those
|
|
||||||
platforms PCC is the "default" choice.
|
|
||||||
|
|
||||||
However, it is possible to disable this interface via a BIOS setting. In
|
|
||||||
such an instance, as is also the case on platforms where the PCC interface
|
|
||||||
is not implemented, the PCC driver will fail to load silently.
|
|
||||||
|
|
||||||
2. Driver and /sys details:
|
|
||||||
---------------------------
|
|
||||||
When the driver loads, it merely prints the lowest and the highest CPU
|
|
||||||
frequencies supported by the platform firmware.
|
|
||||||
|
|
||||||
The PCC driver loads with a message such as:
|
|
||||||
pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
|
|
||||||
MHz
|
|
||||||
|
|
||||||
This means that the OSPM can request the CPU to run at any frequency in
|
|
||||||
between the limits (1600 MHz, and 2933 MHz) specified in the message.
|
|
||||||
|
|
||||||
Internally, there is no need for the driver to convert the "target" frequency
|
|
||||||
to a corresponding P-state.
|
|
||||||
|
|
||||||
The VERSION number for the driver will be of the format v.xy.ab.
|
|
||||||
eg: 1.00.02
|
|
||||||
----- --
|
|
||||||
| |
|
|
||||||
| -- this will increase with bug fixes/enhancements to the driver
|
|
||||||
|-- this is the version of the PCC specification the driver adheres to
|
|
||||||
|
|
||||||
|
|
||||||
The following is a brief discussion on some of the fields exported via the
|
|
||||||
/sys filesystem and how their values are affected by the PCC driver:
|
|
||||||
|
|
||||||
2.1 scaling_available_frequencies:
|
|
||||||
----------------------------------
|
|
||||||
scaling_available_frequencies is not created in /sys. No intermediate
|
|
||||||
frequencies need to be listed because the BIOS will try to achieve any
|
|
||||||
frequency, within limits, requested by the governor. A frequency does not have
|
|
||||||
to be strictly associated with a P-state.
|
|
||||||
|
|
||||||
2.2 cpuinfo_transition_latency:
|
|
||||||
-------------------------------
|
|
||||||
The cpuinfo_transition_latency field is 0. The PCC specification does
|
|
||||||
not include a field to expose this value currently.
|
|
||||||
|
|
||||||
2.3 cpuinfo_cur_freq:
|
|
||||||
---------------------
|
|
||||||
A) Often cpuinfo_cur_freq will show a value different than what is declared
|
|
||||||
in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
|
|
||||||
This is due to "turbo boost" available on recent Intel processors. If certain
|
|
||||||
conditions are met the BIOS can achieve a slightly higher speed than requested
|
|
||||||
by OSPM. An example:
|
|
||||||
|
|
||||||
scaling_cur_freq : 2933000
|
|
||||||
cpuinfo_cur_freq : 3196000
|
|
||||||
|
|
||||||
B) There is a round-off error associated with the cpuinfo_cur_freq value.
|
|
||||||
Since the driver obtains the current frequency as a "percentage" (%) of the
|
|
||||||
nominal frequency from the BIOS, sometimes, the values displayed by
|
|
||||||
scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
|
|
||||||
|
|
||||||
scaling_cur_freq : 1600000
|
|
||||||
cpuinfo_cur_freq : 1583000
|
|
||||||
|
|
||||||
In this example, the nominal frequency is 2933 MHz. The driver obtains the
|
|
||||||
current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
|
|
||||||
|
|
||||||
54% of 2933 MHz = 1583 MHz
|
|
||||||
|
|
||||||
Nominal frequency is the maximum frequency of the processor, and it usually
|
|
||||||
corresponds to the frequency of the P0 P-state.
|
|
||||||
|
|
||||||
2.4 related_cpus:
|
|
||||||
-----------------
|
|
||||||
The related_cpus field is identical to affected_cpus.
|
|
||||||
|
|
||||||
affected_cpus : 4
|
|
||||||
related_cpus : 4
|
|
||||||
|
|
||||||
Currently, the PCC driver does not evaluate _PSD. The platforms that support
|
|
||||||
PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
|
|
||||||
to ensure that the same frequency is requested of all dependent CPUs.
|
|
||||||
|
|
||||||
3. Caveats:
|
|
||||||
-----------
|
|
||||||
The "cpufreq_stats" module in its present form cannot be loaded and
|
|
||||||
expected to work with the PCC driver. Since the "cpufreq_stats" module
|
|
||||||
provides information wrt each P-state, it is not applicable to the PCC driver.
|
|
|
@ -1,22 +0,0 @@
|
||||||
Debugging Modules after 2.6.3
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
In almost all distributions, the kernel asks for modules which don't
|
|
||||||
exist, such as "net-pf-10" or whatever. Changing "modprobe -q" to
|
|
||||||
"succeed" in this case is hacky and breaks some setups, and also we
|
|
||||||
want to know if it failed for the fallback code for old aliases in
|
|
||||||
fs/char_dev.c, for example.
|
|
||||||
|
|
||||||
In the past a debugging message which would fill people's logs was
|
|
||||||
emitted. This debugging message has been removed. The correct way
|
|
||||||
of debugging module problems is something like this:
|
|
||||||
|
|
||||||
echo '#! /bin/sh' > /tmp/modprobe
|
|
||||||
echo 'echo "$@" >> /tmp/modprobe.log' >> /tmp/modprobe
|
|
||||||
echo 'exec /sbin/modprobe "$@"' >> /tmp/modprobe
|
|
||||||
chmod a+x /tmp/modprobe
|
|
||||||
echo /tmp/modprobe > /proc/sys/kernel/modprobe
|
|
||||||
|
|
||||||
Note that the above applies only when the *kernel* is requesting
|
|
||||||
that the module be loaded -- it won't have any effect if that module
|
|
||||||
is being loaded explicitly using "modprobe" from userspace.
|
|
|
@ -203,7 +203,7 @@ Cause
|
||||||
may not correctly copy files from sysfs.
|
may not correctly copy files from sysfs.
|
||||||
|
|
||||||
Solution
|
Solution
|
||||||
Use ``cat``' to read ``.gcda`` files and ``cp -d`` to copy links.
|
Use ``cat`` to read ``.gcda`` files and ``cp -d`` to copy links.
|
||||||
Alternatively use the mechanism shown in Appendix B.
|
Alternatively use the mechanism shown in Appendix B.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,8 @@ with the difference that the orphan objects are not freed but only
|
||||||
reported via /sys/kernel/debug/kmemleak. A similar method is used by the
|
reported via /sys/kernel/debug/kmemleak. A similar method is used by the
|
||||||
Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in
|
Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in
|
||||||
user-space applications.
|
user-space applications.
|
||||||
Kmemleak is supported on x86, arm, powerpc, sparc, sh, microblaze, ppc, mips, s390 and tile.
|
Kmemleak is supported on x86, arm, arm64, powerpc, sparc, sh, microblaze, mips,
|
||||||
|
s390, nds32, arc and xtensa.
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
|
|
@ -17,14 +17,23 @@ What is KUnit?
|
||||||
==============
|
==============
|
||||||
|
|
||||||
KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
|
KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
|
||||||
These tests are able to be run locally on a developer's workstation without a VM
|
|
||||||
or special hardware.
|
|
||||||
|
|
||||||
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
|
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
|
||||||
Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
|
Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
|
||||||
cases, grouping related test cases into test suites, providing common
|
cases, grouping related test cases into test suites, providing common
|
||||||
infrastructure for running tests, and much more.
|
infrastructure for running tests, and much more.
|
||||||
|
|
||||||
|
KUnit consists of a kernel component, which provides a set of macros for easily
|
||||||
|
writing unit tests. Tests written against KUnit will run on kernel boot if
|
||||||
|
built-in, or when loaded if built as a module. These tests write out results to
|
||||||
|
the kernel log in `TAP <https://testanything.org/>`_ format.
|
||||||
|
|
||||||
|
To make running these tests (and reading the results) easier, KUnit offers
|
||||||
|
:doc:`kunit_tool <kunit-tool>`, which builds a `User Mode Linux
|
||||||
|
<http://user-mode-linux.sourceforge.net>`_ kernel, runs it, and parses the test
|
||||||
|
results. This provides a quick way of running KUnit tests during development,
|
||||||
|
without requiring a virtual machine or separate hardware.
|
||||||
|
|
||||||
Get started now: :doc:`start`
|
Get started now: :doc:`start`
|
||||||
|
|
||||||
Why KUnit?
|
Why KUnit?
|
||||||
|
@ -36,21 +45,20 @@ allow all possible code paths to be tested in the code under test; this is only
|
||||||
possible if the code under test is very small and does not have any external
|
possible if the code under test is very small and does not have any external
|
||||||
dependencies outside of the test's control like hardware.
|
dependencies outside of the test's control like hardware.
|
||||||
|
|
||||||
Outside of KUnit, there are no testing frameworks currently
|
KUnit provides a common framework for unit tests within the kernel.
|
||||||
available for the kernel that do not require installing the kernel on a test
|
|
||||||
machine or in a VM and all require tests to be written in userspace running on
|
|
||||||
the kernel; this is true for Autotest, and kselftest, disqualifying
|
|
||||||
any of them from being considered unit testing frameworks.
|
|
||||||
|
|
||||||
KUnit addresses the problem of being able to run tests without needing a virtual
|
KUnit tests can be run on most architectures, and most tests are architecture
|
||||||
machine or actual hardware with User Mode Linux. User Mode Linux is a Linux
|
independent. All built-in KUnit tests run on kernel startup. Alternatively,
|
||||||
architecture, like ARM or x86; however, unlike other architectures it compiles
|
KUnit and KUnit tests can be built as modules and tests will run when the test
|
||||||
to a standalone program that can be run like any other program directly inside
|
module is loaded.
|
||||||
of a host operating system; to be clear, it does not require any virtualization
|
|
||||||
support; it is just a regular program.
|
|
||||||
|
|
||||||
Alternatively, kunit and kunit tests can be built as modules and tests will
|
.. note::
|
||||||
run when the test module is loaded.
|
|
||||||
|
KUnit can also run tests without needing a virtual machine or actual
|
||||||
|
hardware under User Mode Linux. User Mode Linux is a Linux architecture,
|
||||||
|
like ARM or x86, which compiles the kernel as a Linux executable. KUnit
|
||||||
|
can be used with UML either by building with ``ARCH=um`` (like any other
|
||||||
|
architecture), or by using :doc:`kunit_tool <kunit-tool>`.
|
||||||
|
|
||||||
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
|
||||||
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
several dozen tests in only 10 to 20 seconds; this might not sound like a big
|
||||||
|
@ -81,3 +89,5 @@ How do I use it?
|
||||||
* :doc:`start` - for new users of KUnit
|
* :doc:`start` - for new users of KUnit
|
||||||
* :doc:`usage` - for a more detailed explanation of KUnit features
|
* :doc:`usage` - for a more detailed explanation of KUnit features
|
||||||
* :doc:`api/index` - for the list of KUnit APIs used for testing
|
* :doc:`api/index` - for the list of KUnit APIs used for testing
|
||||||
|
* :doc:`kunit-tool` - for more information on the kunit_tool helper script
|
||||||
|
* :doc:`faq` - for answers to some common questions about KUnit
|
||||||
|
|
|
@ -12,6 +12,13 @@ the Linux kernel as UML (`User Mode Linux
|
||||||
<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
|
<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
|
||||||
the test results and displaying them in a user friendly manner.
|
the test results and displaying them in a user friendly manner.
|
||||||
|
|
||||||
|
kunit_tool addresses the problem of being able to run tests without needing a
|
||||||
|
virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
|
||||||
|
Linux architecture, like ARM or x86; however, unlike other architectures it
|
||||||
|
compiles the kernel as a standalone Linux executable that can be run like any
|
||||||
|
other program directly inside of a host operating system. To be clear, it does
|
||||||
|
not require any virtualization support: it is just a regular program.
|
||||||
|
|
||||||
What is a kunitconfig?
|
What is a kunitconfig?
|
||||||
======================
|
======================
|
||||||
|
|
||||||
|
|
|
@ -9,11 +9,10 @@ Installing dependencies
|
||||||
KUnit has the same dependencies as the Linux kernel. As long as you can build
|
KUnit has the same dependencies as the Linux kernel. As long as you can build
|
||||||
the kernel, you can run KUnit.
|
the kernel, you can run KUnit.
|
||||||
|
|
||||||
KUnit Wrapper
|
Running tests with the KUnit Wrapper
|
||||||
=============
|
====================================
|
||||||
Included with KUnit is a simple Python wrapper that helps format the output to
|
Included with KUnit is a simple Python wrapper which runs tests under User Mode
|
||||||
easily use and read KUnit output. It handles building and running the kernel, as
|
Linux, and formats the test results.
|
||||||
well as formatting the output.
|
|
||||||
|
|
||||||
The wrapper can be run with:
|
The wrapper can be run with:
|
||||||
|
|
||||||
|
@ -21,22 +20,42 @@ The wrapper can be run with:
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --defconfig
|
./tools/testing/kunit/kunit.py run --defconfig
|
||||||
|
|
||||||
For more information on this wrapper (also called kunit_tool) checkout the
|
For more information on this wrapper (also called kunit_tool) check out the
|
||||||
:doc:`kunit-tool` page.
|
:doc:`kunit-tool` page.
|
||||||
|
|
||||||
Creating a .kunitconfig
|
Creating a .kunitconfig
|
||||||
=======================
|
-----------------------
|
||||||
The Python script is a thin wrapper around Kbuild. As such, it needs to be
|
If you want to run a specific set of tests (rather than those listed in the
|
||||||
configured with a ``.kunitconfig`` file. This file essentially contains the
|
KUnit defconfig), you can provide Kconfig options in the ``.kunitconfig`` file.
|
||||||
regular Kernel config, with the specific test targets as well.
|
This file essentially contains the regular Kernel config, with the specific
|
||||||
|
test targets as well. The ``.kunitconfig`` should also contain any other config
|
||||||
|
options required by the tests.
|
||||||
|
|
||||||
|
A good starting point for a ``.kunitconfig`` is the KUnit defconfig:
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
cd $PATH_TO_LINUX_REPO
|
cd $PATH_TO_LINUX_REPO
|
||||||
cp arch/um/configs/kunit_defconfig .kunitconfig
|
cp arch/um/configs/kunit_defconfig .kunitconfig
|
||||||
|
|
||||||
Verifying KUnit Works
|
You can then add any other Kconfig options you wish, e.g.:
|
||||||
---------------------
|
.. code-block:: none
|
||||||
|
|
||||||
|
CONFIG_LIST_KUNIT_TEST=y
|
||||||
|
|
||||||
|
:doc:`kunit_tool <kunit-tool>` will ensure that all config options set in
|
||||||
|
``.kunitconfig`` are set in the kernel ``.config`` before running the tests.
|
||||||
|
It'll warn you if you haven't included the dependencies of the options you're
|
||||||
|
using.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
Note that removing something from the ``.kunitconfig`` will not trigger a
|
||||||
|
rebuild of the ``.config`` file: the configuration is only updated if the
|
||||||
|
``.kunitconfig`` is not a subset of ``.config``. This means that you can use
|
||||||
|
other tools (such as make menuconfig) to adjust other config options.
|
||||||
|
|
||||||
|
|
||||||
|
Running the tests
|
||||||
|
-----------------
|
||||||
|
|
||||||
To make sure that everything is set up correctly, simply invoke the Python
|
To make sure that everything is set up correctly, simply invoke the Python
|
||||||
wrapper from your kernel repo:
|
wrapper from your kernel repo:
|
||||||
|
@ -62,6 +81,41 @@ followed by a list of tests that are run. All of them should be passing.
|
||||||
Because it is building a lot of sources for the first time, the
|
Because it is building a lot of sources for the first time, the
|
||||||
``Building KUnit kernel`` step may take a while.
|
``Building KUnit kernel`` step may take a while.
|
||||||
|
|
||||||
|
Running tests without the KUnit Wrapper
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
If you'd rather not use the KUnit Wrapper (if, for example, you need to
|
||||||
|
integrate with other systems, or use an architecture other than UML), KUnit can
|
||||||
|
be included in any kernel, and the results read out and parsed manually.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
KUnit is not designed for use in a production system, and it's possible that
|
||||||
|
tests may reduce the stability or security of the system.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Configuring the kernel
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
In order to enable KUnit itself, you simply need to enable the ``CONFIG_KUNIT``
|
||||||
|
Kconfig option (it's under Kernel Hacking/Kernel Testing and Coverage in
|
||||||
|
menuconfig). From there, you can enable any KUnit tests you want: they usually
|
||||||
|
have config options ending in ``_KUNIT_TEST``.
|
||||||
|
|
||||||
|
KUnit and KUnit tests can be compiled as modules: in this case the tests in a
|
||||||
|
module will be run when the module is loaded.
|
||||||
|
|
||||||
|
Running the tests
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
Build and run your kernel as usual. Test output will be written to the kernel
|
||||||
|
log in `TAP <https://testanything.org/>`_ format.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
It's possible that there will be other lines and/or data interspersed in the
|
||||||
|
TAP output.
|
||||||
|
|
||||||
|
|
||||||
Writing your first test
|
Writing your first test
|
||||||
=======================
|
=======================
|
||||||
|
|
||||||
|
|
|
@ -551,6 +551,7 @@ options to your ``.config``:
|
||||||
Once the kernel is built and installed, a simple
|
Once the kernel is built and installed, a simple
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
modprobe example-test
|
modprobe example-test
|
||||||
|
|
||||||
...will run the tests.
|
...will run the tests.
|
||||||
|
@ -590,3 +591,17 @@ able to run one test case per invocation.
|
||||||
|
|
||||||
.. TODO(brendanhiggins@google.com): Add an actual example of an architecture
|
.. TODO(brendanhiggins@google.com): Add an actual example of an architecture
|
||||||
dependent KUnit test.
|
dependent KUnit test.
|
||||||
|
|
||||||
|
KUnit debugfs representation
|
||||||
|
============================
|
||||||
|
When KUnit test suites are initialized, they create an associated directory
|
||||||
|
in /sys/kernel/debug/kunit/<test-suite>. The directory contains one file:
|
||||||
|
|
||||||
|
- results: "cat results" displays results of each test case and the results
|
||||||
|
of the entire suite for the last test run.
|
||||||
|
|
||||||
|
The debugfs representation is primarily of use when KUnit test suites are
|
||||||
|
run in a native environment, either as modules or built-in. Having a way
|
||||||
|
to display results like this is valuable as otherwise results can be
|
||||||
|
intermixed with other events in dmesg output. The maximum size of each
|
||||||
|
results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h).
|
||||||
|
|
3
Documentation/devicetree/bindings/.gitignore
vendored
3
Documentation/devicetree/bindings/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0-only
|
||||||
*.example.dts
|
*.example.dts
|
||||||
processed-schema.yaml
|
processed-schema*.yaml
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue