From bdf2b8cbf076568f7d17dc62467348d409142298 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:54 +0900 Subject: selftests/ftrace: Add user-memory access syntax testcase Add a user-memory access syntax testcase which checks new user-memory access syntax and ustring type. Link: http://lkml.kernel.org/r/155789873385.26965.9557271156179140676.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../ftrace/test.d/kprobe/kprobe_args_user.tc | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc (limited to 'tools/testing') diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc new file mode 100644 index 000000000000..0f60087583d8 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event user-memory access + +[ -f kprobe_events ] || exit_unsupported # this is configurable + +grep -q '\$arg' README || exit_unresolved # depends on arch +grep -A10 "fetcharg:" README | grep -q 'ustring' || exit_unsupported +grep -A10 "fetcharg:" README | grep -q '\[u\]' || exit_unsupported + +:;: "user-memory access syntax and ustring working on user memory";: +echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \ + > kprobe_events + +grep myevent kprobe_events | \ + grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string' +echo 1 > events/kprobes/myevent/enable +echo > /dev/null +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"' + +:;: "user-memory access syntax and ustring not working with kernel memory";: +echo 'p:myevent vfs_symlink path=+0($arg3):ustring path2=+u0($arg3):string' \ + > kprobe_events +echo 1 > events/kprobes/myevent/enable +ln -s foo $TMPDIR/bar +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path=(fault) path2=(fault)' + +exit 0 -- cgit v1.2.3-55-g7522 From 76ab785e7396796118016c8bc9fd62b58fe422ea Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:58 -0600 Subject: NTB: Add ntb_msi_test support to ntb_test When the ntb_msi_test module is available, the test code will trigger each of the interrupts and ensure the corresponding occurrences files gets incremented. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- tools/testing/selftests/ntb/ntb_test.sh | 54 +++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh index 08cbfbbc7029..7df497095b72 100755 --- a/tools/testing/selftests/ntb/ntb_test.sh +++ b/tools/testing/selftests/ntb/ntb_test.sh @@ -87,10 +87,10 @@ set -e function _modprobe() { - modprobe "$@" + modprobe "$@" || return 1 if [[ "$REMOTE_HOST" != "" ]]; then - ssh "$REMOTE_HOST" modprobe "$@" + ssh "$REMOTE_HOST" modprobe "$@" || return 1 fi } @@ -451,6 +451,30 @@ function pingpong_test() echo " Passed" } +function msi_test() +{ + LOC=$1 + REM=$2 + + write_file 1 $LOC/ready + + echo "Running MSI interrupt tests on: $(subdirname $LOC) / $(subdirname $REM)" + + CNT=$(read_file "$LOC/count") + for ((i = 0; i < $CNT; i++)); do + START=$(read_file $REM/../irq${i}_occurrences) + write_file $i $LOC/trigger + END=$(read_file $REM/../irq${i}_occurrences) + + if [[ $(($END - $START)) != 1 ]]; then + echo "MSI did not trigger the interrupt on the remote side!" >&2 + exit 1 + fi + done + + echo " Passed" +} + function perf_test() { USE_DMA=$1 @@ -529,6 +553,29 @@ function ntb_pingpong_tests() _modprobe -r ntb_pingpong } +function ntb_msi_tests() +{ + LOCAL_MSI="$DEBUGFS/ntb_msi_test/$LOCAL_DEV" + REMOTE_MSI="$REMOTE_HOST:$DEBUGFS/ntb_msi_test/$REMOTE_DEV" + + echo "Starting ntb_msi_test tests..." + + if ! _modprobe ntb_msi_test 2> /dev/null; then + echo " Not doing MSI tests seeing the module is not available." + return + fi + + port_test $LOCAL_MSI $REMOTE_MSI + + LOCAL_PEER="$LOCAL_MSI/peer$LOCAL_PIDX" + REMOTE_PEER="$REMOTE_MSI/peer$REMOTE_PIDX" + + msi_test $LOCAL_PEER $REMOTE_PEER + msi_test $REMOTE_PEER $LOCAL_PEER + + _modprobe -r ntb_msi_test +} + function ntb_perf_tests() { LOCAL_PERF="$DEBUGFS/ntb_perf/$LOCAL_DEV" @@ -550,6 +597,7 @@ function cleanup() _modprobe -r ntb_perf 2> /dev/null _modprobe -r ntb_pingpong 2> /dev/null _modprobe -r ntb_transport 2> /dev/null + _modprobe -r ntb_msi_test 2> /dev/null set -e } @@ -586,5 +634,7 @@ ntb_tool_tests echo ntb_pingpong_tests echo +ntb_msi_tests +echo ntb_perf_tests echo -- cgit v1.2.3-55-g7522 From 8d0f1e05ab16c4bd628ddaefd20b94ffb36d799c Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Mon, 17 Jun 2019 17:24:58 -0400 Subject: selftests/powerpc: Fix earlyclobber in tm-vmxcopy In some cases, compiler can allocate the same register for operand 'res' and 'vecoutptr', resulting in segfault at 'stxvd2x 40,0,%[vecoutptr]' because base register will contain 1, yielding a false-positive. This is because output 'res' must be marked as an earlyclobber operand so it may not overlap an input operand ('vecoutptr'). Signed-off-by: Gustavo Romero Reviewed-by: Segher Boessenkool Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/tm/tm-vmxcopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c index fe52811584ae..0815f06f3590 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c @@ -79,7 +79,7 @@ int test_vmxcopy() "5:;" "stxvd2x 40,0,%[vecoutptr];" - : [res]"=r"(aborted) + : [res]"=&r"(aborted) : [vecinptr]"r"(&vecin), [vecoutptr]"r"(&vecout), [map]"r"(a) -- cgit v1.2.3-55-g7522 From 6820e565d35084bb9d3a167e990fcb780ccd716f Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sun, 23 Jun 2019 18:52:00 +0300 Subject: selftests/powerpc: ppc_asm.h: typo in the header guard The guard macro __PPC_ASM_H in the header ppc_asm.h doesn't match the #ifndef macro _PPC_ASM_H. The patch makes them the same. Signed-off-by: Denis Efremov Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h index d2c0a911f55e..2b488b78c4f2 100644 --- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h +++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PPC_ASM_H -#define __PPC_ASM_H +#define _PPC_ASM_H #include #ifndef r1 -- cgit v1.2.3-55-g7522 From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:08 +0200 Subject: memremap: move dev_pagemap callbacks into a separate structure The dev_pagemap is a growing too many callbacks. Move them into a separate ops structure so that they are not duplicated for multiple instances, and an attacker can't easily overwrite them. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 11 +++++++---- drivers/dax/pmem/core.c | 2 +- drivers/nvdimm/pmem.c | 19 +++++++++++-------- drivers/pci/p2pdma.c | 8 ++++++-- include/linux/memremap.h | 36 ++++++++++++++++++++---------------- kernel/memremap.c | 18 +++++++++--------- mm/hmm.c | 10 +++++++--- tools/testing/nvdimm/test/iomap.c | 7 ++++--- 8 files changed, 65 insertions(+), 46 deletions(-) (limited to 'tools/testing') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 79014baa782d..f390083a64d7 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -36,9 +36,8 @@ static void dev_dax_percpu_exit(struct percpu_ref *ref) percpu_ref_exit(ref); } -static void dev_dax_percpu_kill(struct percpu_ref *data) +static void dev_dax_percpu_kill(struct percpu_ref *ref) { - struct percpu_ref *ref = data; struct dev_dax *dev_dax = ref_to_dev_dax(ref); dev_dbg(&dev_dax->dev, "%s\n", __func__); @@ -442,6 +441,11 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static const struct dev_pagemap_ops dev_dax_pagemap_ops = { + .kill = dev_dax_percpu_kill, + .cleanup = dev_dax_percpu_exit, +}; + int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -466,9 +470,8 @@ int dev_dax_probe(struct device *dev) return rc; dev_dax->pgmap.ref = &dev_dax->ref; - dev_dax->pgmap.kill = dev_dax_percpu_kill; - dev_dax->pgmap.cleanup = dev_dax_percpu_exit; dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; + dev_dax->pgmap.ops = &dev_dax_pagemap_ops; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index f9f51786d556..6eb6dfdf19bf 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) struct dev_dax *dev_dax; struct nd_namespace_io *nsio; struct dax_region *dax_region; - struct dev_pagemap pgmap = { 0 }; + struct dev_pagemap pgmap = { }; struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 24d7fe7c74ed..c2449af2b388 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,7 +303,7 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void __pmem_release_queue(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct percpu_ref *ref) { struct request_queue *q; @@ -313,10 +313,10 @@ static void __pmem_release_queue(struct percpu_ref *ref) static void pmem_release_queue(void *ref) { - __pmem_release_queue(ref); + pmem_pagemap_cleanup(ref); } -static void pmem_freeze_queue(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct percpu_ref *ref) { struct request_queue *q; @@ -339,19 +339,24 @@ static void pmem_release_pgmap_ops(void *__pgmap) dev_pagemap_put_ops(); } -static void fsdax_pagefree(struct page *page, void *data) +static void pmem_pagemap_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); } +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .page_free = pmem_pagemap_page_free, + .kill = pmem_pagemap_kill, + .cleanup = pmem_pagemap_cleanup, +}; + static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) { dev_pagemap_get_ops(); if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) return -ENOMEM; pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->page_free = fsdax_pagefree; - + pgmap->ops = &fsdax_pagemap_ops; return 0; } @@ -409,8 +414,6 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; - pmem->pgmap.kill = pmem_freeze_queue; - pmem->pgmap.cleanup = __pmem_release_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index a4994aa3acc0..fb039259d463 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -153,6 +153,11 @@ out: return error; } +static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = { + .kill = pci_p2pdma_percpu_kill, + .cleanup = pci_p2pdma_percpu_cleanup, +}; + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -208,8 +213,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); - pgmap->kill = pci_p2pdma_percpu_kill; - pgmap->cleanup = pci_p2pdma_percpu_cleanup; + pgmap->ops = &pci_p2pdma_pagemap_ops; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0c86f2c5ac9c..919755f48c7e 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -63,41 +63,45 @@ enum memory_type { MEMORY_DEVICE_PCI_P2PDMA, }; -/* - * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief - * explanation in include/linux/memory_hotplug.h. - * - * The page_free() callback is called once the page refcount reaches 1 - * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. - * This allows the device driver to implement its own memory management.) - */ -typedef void (*dev_page_free_t)(struct page *page, void *data); +struct dev_pagemap_ops { + /* + * Called once the page refcount reaches 1. (ZONE_DEVICE pages never + * reach 0 refcount unless there is a refcount bug. This allows the + * device driver to implement its own memory management.) + */ + void (*page_free)(struct page *page, void *data); + + /* + * Transition the refcount in struct dev_pagemap to the dead state. + */ + void (*kill)(struct percpu_ref *ref); + + /* + * Wait for refcount in struct dev_pagemap to be idle and reap it. + */ + void (*cleanup)(struct percpu_ref *ref); +}; /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping - * @kill: callback to transition @ref to the dead state - * @cleanup: callback to wait for @ref to be idle and reap it * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @ops: method table */ struct dev_pagemap { - dev_page_free_t page_free; struct vmem_altmap altmap; bool altmap_valid; struct resource res; struct percpu_ref *ref; - void (*kill)(struct percpu_ref *ref); - void (*cleanup)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; + const struct dev_pagemap_ops *ops; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/kernel/memremap.c b/kernel/memremap.c index abda62d1e5a3..0824237ef979 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data) * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized - * by the caller before passing it to this function + * 1/ At a minimum the res, ref and type and ops members of @pgmap must be + * initialized by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true @@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); } @@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 48574f8485bb..583a02a16872 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1384,6 +1384,12 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } +static const struct dev_pagemap_ops hmm_pagemap_ops = { + .page_free = hmm_devmem_free, + .kill = hmm_devmem_ref_kill, + .cleanup = hmm_devmem_ref_exit, +}; + /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1438,12 +1444,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 076df22e4bda..cf3f064a697d 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,9 +100,10 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup); - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); + WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup); + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3-55-g7522 From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:09 +0200 Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup Passing the actual typed structure leads to more understandable code vs just passing the ref member. Reported-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 12 ++++++------ drivers/nvdimm/pmem.c | 18 +++++++++--------- drivers/pci/p2pdma.c | 9 +++++---- include/linux/memremap.h | 4 ++-- kernel/memremap.c | 8 ++++---- mm/hmm.c | 10 +++++----- tools/testing/nvdimm/test/iomap.c | 4 ++-- 7 files changed, 33 insertions(+), 32 deletions(-) (limited to 'tools/testing') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index f390083a64d7..b5257038c188 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -27,21 +27,21 @@ static void dev_dax_percpu_release(struct percpu_ref *ref) complete(&dev_dax->cmp); } -static void dev_dax_percpu_exit(struct percpu_ref *ref) +static void dev_dax_percpu_exit(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); wait_for_completion(&dev_dax->cmp); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void dev_dax_percpu_kill(struct percpu_ref *ref) +static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c2449af2b388..9dac48359353 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void pmem_pagemap_cleanup(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_cleanup_queue(q); } -static void pmem_release_queue(void *ref) +static void pmem_release_queue(void *pgmap) { - pmem_pagemap_cleanup(ref); + pmem_pagemap_cleanup(pgmap); } -static void pmem_pagemap_kill(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_freeze_queue_start(q); } @@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev, memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); } else { if (devm_add_action_or_reset(dev, pmem_release_queue, - &q->q_usage_counter)) + &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index fb039259d463..fa6249e4ed5f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -91,14 +91,15 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref) complete(&p2p_pgmap->ref_done); } -static void pci_p2pdma_percpu_kill(struct percpu_ref *ref) +static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } -static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref) +static void pci_p2pdma_percpu_cleanup(struct dev_pagemap *pgmap) { - struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref); + struct p2pdma_pagemap *p2p_pgmap = + container_of(pgmap, struct p2pdma_pagemap, pgmap); wait_for_completion(&p2p_pgmap->ref_done); percpu_ref_exit(&p2p_pgmap->ref); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 919755f48c7e..b8666a0d8665 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -74,12 +74,12 @@ struct dev_pagemap_ops { /* * Transition the refcount in struct dev_pagemap to the dead state. */ - void (*kill)(struct percpu_ref *ref); + void (*kill)(struct dev_pagemap *pgmap); /* * Wait for refcount in struct dev_pagemap to be idle and reap it. */ - void (*cleanup)(struct percpu_ref *ref); + void (*cleanup)(struct dev_pagemap *pgmap); }; /** diff --git a/kernel/memremap.c b/kernel/memremap.c index 0824237ef979..00c1ceb60c19 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/mm/hmm.c b/mm/hmm.c index 583a02a16872..987793fba923 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1352,18 +1352,18 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref) complete(&devmem->completion); } -static void hmm_devmem_ref_exit(struct percpu_ref *ref) +static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap) { struct hmm_devmem *devmem; - devmem = container_of(ref, struct hmm_devmem, ref); + devmem = container_of(pgmap, struct hmm_devmem, pagemap); wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void hmm_devmem_ref_kill(struct percpu_ref *ref) +static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index cf3f064a697d..82f901569e06 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -102,8 +102,8 @@ static void nfit_test_kill(void *_pgmap) WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3-55-g7522 From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:14 +0200 Subject: memremap: provide an optional internal refcount in struct dev_pagemap Provide an internal refcounting logic if no ->ref field is provided in the pagemap passed into devm_memremap_pages so that callers don't have to reinvent it poorly. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- include/linux/memremap.h | 4 +++ kernel/memremap.c | 64 +++++++++++++++++++++++++++++++-------- tools/testing/nvdimm/test/iomap.c | 58 +++++++++++++++++++++++++++-------- 3 files changed, 101 insertions(+), 25 deletions(-) (limited to 'tools/testing') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e25685b878e9..f8a5b2a19945 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -95,6 +95,8 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping + * @internal_ref: internal reference if @ref is not provided by the caller + * @done: completion for @internal_ref * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -105,6 +107,8 @@ struct dev_pagemap { struct vmem_altmap altmap; struct resource res; struct percpu_ref *ref; + struct percpu_ref internal_ref; + struct completion done; struct device *dev; enum memory_type type; unsigned int flags; diff --git a/kernel/memremap.c b/kernel/memremap.c index eee490e7d7e1..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data) static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) { - if (!pgmap->ops->page_free) { + if (!pgmap->ops || !pgmap->ops->page_free) { WARN(1, "Missing page_free method\n"); return -EINVAL; } @@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type and ops members of @pgmap must be - * initialized by the caller before passing it to this function + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } if (need_devmap_managed) { @@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 82f901569e06..cd040b5abffe 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,26 +100,60 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + WARN_ON(!pgmap || !pgmap->ref); + + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); + + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { + int error; resource_size_t offset = pgmap->res.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); - if (nfit_res) { - int rc; - - rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); - if (rc) - return ERR_PTR(rc); - return nfit_res->buf + offset - nfit_res->res.start; + if (!nfit_res) + return devm_memremap_pages(dev, pgmap); + + pgmap->dev = dev; + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } - return devm_memremap_pages(dev, pgmap); + + error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); + if (error) + return ERR_PTR(error); + return nfit_res->buf + offset - nfit_res->res.start; } EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); -- cgit v1.2.3-55-g7522 From 7b570361f6f66c91443541b19121038c076e7d64 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 17 Jun 2019 16:52:04 +0200 Subject: selftests/powerpc: Add missing newline at end of file "git diff" says: \ No newline at end of file after modifying the file. Signed-off-by: Geert Uytterhoeven [mpe: Rebase since addition of another test] Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/mm/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index d503b8764a8e..7101ffd08d66 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -4,4 +4,4 @@ tempfile prot_sao segv_errors wild_bctr -large_vm_fork_separation \ No newline at end of file +large_vm_fork_separation -- cgit v1.2.3-55-g7522 From 5a1ea4774ddc2c6bc3ba1415880091eccf1a901e Mon Sep 17 00:00:00 2001 From: Naveen N. Rao Date: Wed, 3 Jul 2019 22:33:59 +0530 Subject: powerpc/pseries: Move mm/book3s64/vphn.c under platforms/pseries/ hcall_vphn() is specific to pseries and will be used in a subsequent patch. So, move it to a more appropriate place under arch/powerpc/platforms/pseries. Also merge vphn.h into lppaca.h and update vphn selftest to use the new files. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/lppaca.h | 24 ++++++ arch/powerpc/mm/book3s64/Makefile | 1 - arch/powerpc/mm/book3s64/vphn.c | 73 ------------------- arch/powerpc/mm/book3s64/vphn.h | 24 ------ arch/powerpc/mm/numa.c | 14 ---- arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/vphn.c | 89 +++++++++++++++++++++++ tools/testing/selftests/powerpc/vphn/Makefile | 2 +- tools/testing/selftests/powerpc/vphn/asm/lppaca.h | 1 + tools/testing/selftests/powerpc/vphn/vphn.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.h | 1 - 11 files changed, 117 insertions(+), 115 deletions(-) delete mode 100644 arch/powerpc/mm/book3s64/vphn.c delete mode 100644 arch/powerpc/mm/book3s64/vphn.h create mode 100644 arch/powerpc/platforms/pseries/vphn.c create mode 120000 tools/testing/selftests/powerpc/vphn/asm/lppaca.h delete mode 120000 tools/testing/selftests/powerpc/vphn/vphn.h (limited to 'tools/testing') diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index e45b7293414d..f5195b4d9ffb 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -18,6 +18,29 @@ */ #ifndef _ASM_POWERPC_LPPACA_H #define _ASM_POWERPC_LPPACA_H + +/* + * The below VPHN macros are outside the __KERNEL__ check since these are + * used for compiling the vphn selftest in userspace + */ + +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ +#define VPHN_REGISTER_COUNT 6 + +/* + * 6 64-bit registers unpacked into up to 24 be32 associativity values. To + * form the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) + +/* + * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags: + * 1 for retrieving associativity information for a guest cpu + * 2 for retrieving associativity information for a host/hypervisor cpu + */ +#define VPHN_FLAG_VCPU 1 +#define VPHN_FLAG_PCPU 2 + #ifdef __KERNEL__ /* @@ -179,6 +202,7 @@ extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); extern void register_dtl_buffer(int cpu); extern void alloc_dtl_buffers(void); +extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity); #endif /* CONFIG_PPC_BOOK3S */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 974b4fc19f4f..fd393b8be14f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_PPC_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c deleted file mode 100644 index 0ee7734afb50..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* - * Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* - * Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h deleted file mode 100644 index f7ff1e0c3801..000000000000 --- a/arch/powerpc/mm/book3s64/vphn.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) - -/* - * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags: - * 1 for retrieving associativity information for a guest cpu - * 2 for retrieving associativity information for a host/hypervisor cpu - */ -#define VPHN_FLAG_VCPU 1 -#define VPHN_FLAG_PCPU 2 - -extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); - -#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 57f006b6214b..50fadc99897b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1067,9 +1067,6 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR - -#include "book3s64/vphn.h" - struct topology_update_data { struct topology_update_data *next; unsigned int cpu; @@ -1087,17 +1084,6 @@ static void reset_topology_timer(void); static int topology_timer_secs = 1; static int topology_inited; -static long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity) -{ - long rc; - long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; - - rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu); - vphn_unpack_associativity(retbuf, associativity); - - return rc; -} - /* * Change polling interval for associativity changes. */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a43ec843c8e2..ab3d59aeacca 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/vphn.c b/arch/powerpc/platforms/pseries/vphn.c new file mode 100644 index 000000000000..3f07bf6c670e --- /dev/null +++ b/arch/powerpc/platforms/pseries/vphn.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +/* + * The associativity domain numbers are returned from the hypervisor as a + * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the + * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 + * bytes. + * + * --- 16-bit fields --> + * _________________________ + * | 0 | 1 | 2 | 3 | be_packed[0] + * ------+-----+-----+------ + * _________________________ + * | 4 | 5 | 6 | 7 | be_packed[1] + * ------------------------- + * ... + * _________________________ + * | 20 | 21 | 22 | 23 | be_packed[5] + * ------------------------- + * + * Convert to the sequence they would appear in the ibm,associativity property. + */ +static int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + __be64 be_packed[VPHN_REGISTER_COUNT]; + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) be_packed; + u16 last = 0; + bool is_32bit = false; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + /* Let's fix the values returned by plpar_hcall9() */ + for (i = 0; i < VPHN_REGISTER_COUNT; i++) + be_packed[i] = cpu_to_be64(packed[i]); + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + u16 new = be16_to_cpup(field++); + + if (is_32bit) { + /* + * Let's concatenate the 16 bits of this field to the + * 15 lower bits of the previous field + */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(last << 16 | new); + is_32bit = false; + } else if (new == VPHN_FIELD_UNUSED) + /* This is the list terminator */ + break; + else if (new & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(new & VPHN_FIELD_MASK); + } else { + /* + * Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + last = new; + is_32bit = true; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} + +/* NOTE: This file is included by a selftest and built in userspace. */ +#ifdef __KERNEL__ +#include + +long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} +#endif diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index 18b885da01bd..cf65cbf33085 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 +CFLAGS += -m64 -I$(CURDIR) top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/vphn/asm/lppaca.h b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h new file mode 120000 index 000000000000..942b1d00999c --- /dev/null +++ b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/lppaca.h \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c index 1d1f5f2be3b2..5b5fbddccabd 120000 --- a/tools/testing/selftests/powerpc/vphn/vphn.c +++ b/tools/testing/selftests/powerpc/vphn/vphn.c @@ -1 +1 @@ -../../../../../arch/powerpc/mm/book3s64/vphn.c \ No newline at end of file +../../../../../arch/powerpc/platforms/pseries/vphn.c \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/vphn/vphn.h b/tools/testing/selftests/powerpc/vphn/vphn.h deleted file mode 120000 index 45fe160f8288..000000000000 --- a/tools/testing/selftests/powerpc/vphn/vphn.h +++ /dev/null @@ -1 +0,0 @@ -../../../../../arch/powerpc/mm/book3s64/vphn.h \ No newline at end of file -- cgit v1.2.3-55-g7522 From 59d82657a08d4a9e6e2d99acc525ef748d36016b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 10 Jul 2019 13:56:54 +0200 Subject: selftests/bpf: fix bpf_target_sparc check bpf_helpers.h fails to compile on sparc: the code should be checking for defined(bpf_target_sparc), but checks simply for bpf_target_sparc. Also change #ifdef bpf_target_powerpc to #if defined() for consistency. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 5a3d92c8bec8..5d63c66bc9d7 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -452,10 +452,10 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #endif -#ifdef bpf_target_powerpc +#if defined(bpf_target_powerpc) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -#elif bpf_target_sparc +#elif defined(bpf_target_sparc) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #else -- cgit v1.2.3-55-g7522 From 9cae4ace80ef39005da106fbb89c952b27d7b89e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 11:12:49 +0200 Subject: selftests/bpf: do not ignore clang failures When compiling an eBPF prog fails, make still returns 0, because failing clang command's output is piped to llc and therefore its exit status is ignored. When clang fails, pipe the string "clang failed" to llc. This will make llc fail with an informative error message. This solution was chosen over using pipefail, having separate targets or getting rid of llc invocation due to its simplicity. In addition, pull Kbuild.include in order to get .DELETE_ON_ERROR target, which would cause partial .o files to be removed. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2620406a53ec..8312db1ae7c2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +include ../../../../scripts/Kbuild.include LIBDIR := ../../../lib BPFDIR := $(LIBDIR)/bpf @@ -196,8 +197,8 @@ $(ALU32_BUILD_DIR)/test_progs_32: prog_tests/*.c $(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR) \ $(ALU32_BUILD_DIR)/test_progs_32 - $(CLANG) $(CLANG_FLAGS) \ - -O2 -target bpf -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mattr=+alu32 -mcpu=$(CPU) $(LLC_FLAGS) \ -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) @@ -208,16 +209,16 @@ endif # Have one program compiled without "-target bpf" to test whether libbpf loads # it successfully $(OUTPUT)/test_xdp.o: progs/test_xdp.c - $(CLANG) $(CLANG_FLAGS) \ - -O2 -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif $(OUTPUT)/%.o: progs/%.c - $(CLANG) $(CLANG_FLAGS) \ - -O2 -target bpf -emit-llvm -c $< -o - | \ + ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ + echo "clang failed") | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ -- cgit v1.2.3-55-g7522 From 748e50c1c13d33dffcdca66f0b1cb3d96d17fe31 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:27 +0200 Subject: selftests/bpf: compile progs with -D__TARGET_ARCH_$(SRCARCH) This opens up the possibility of accessing registers in an arch-independent way. Signed-off-by: Ilya Leoshkevich Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8312db1ae7c2..277d8605e340 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 include ../../../../scripts/Kbuild.include +include ../../../scripts/Makefile.arch LIBDIR := ../../../lib BPFDIR := $(LIBDIR)/bpf @@ -139,7 +140,8 @@ CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - &1 \ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ $(CLANG_SYS_INCLUDES) \ - -Wno-compare-distinct-pointer-types + -Wno-compare-distinct-pointer-types \ + -D__TARGET_ARCH_$(SRCARCH) $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline -- cgit v1.2.3-55-g7522 From 05c2dc17dae310f92a0c5979e227133b0e85f2fa Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:28 +0200 Subject: selftests/bpf: fix s930 -> s390 typo Also check for __s390__ instead of __s390x__, just in case bpf_helpers.h is ever used by 32-bit userspace. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 5d63c66bc9d7..f6e3768660be 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #if defined(__TARGET_ARCH_x86) #define bpf_target_x86 #define bpf_target_defined -#elif defined(__TARGET_ARCH_s930x) - #define bpf_target_s930x +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 #define bpf_target_defined #elif defined(__TARGET_ARCH_arm) #define bpf_target_arm @@ -341,8 +341,8 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #ifndef bpf_target_defined #if defined(__x86_64__) #define bpf_target_x86 -#elif defined(__s390x__) - #define bpf_target_s930x +#elif defined(__s390__) + #define bpf_target_s390 #elif defined(__arm__) #define bpf_target_arm #elif defined(__aarch64__) @@ -369,7 +369,7 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) -#elif defined(bpf_target_s390x) +#elif defined(bpf_target_s390) #define PT_REGS_PARM1(x) ((x)->gprs[2]) #define PT_REGS_PARM2(x) ((x)->gprs[3]) -- cgit v1.2.3-55-g7522 From 7cd04535abc95c50ba9bfa48efc76274b63c70f5 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:29 +0200 Subject: selftests/bpf: make PT_REGS_* work in userspace Right now, on certain architectures, these macros are usable only with kernel headers. This patch makes it possible to use them with userspace headers and, as a consequence, not only in BPF samples, but also in BPF selftests. On s390, provide the forward declaration of struct pt_regs and cast it to user_pt_regs in PT_REGS_* macros. This is necessary, because instead of the full struct pt_regs, s390 exposes only its first member user_pt_regs to userspace, and bpf_helpers.h is used with both userspace (in selftests) and kernel (in samples) headers. It was added in commit 466698e654e8 ("s390/bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type"). Ditto on arm64. On x86, provide userspace versions of PT_REGS_* macros. Unlike s390 and arm64, x86 provides struct pt_regs to both userspace and kernel, however, with different member names. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 75 ++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index f6e3768660be..f804f210244e 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -358,6 +358,7 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #if defined(bpf_target_x86) +#ifdef __KERNEL__ #define PT_REGS_PARM1(x) ((x)->di) #define PT_REGS_PARM2(x) ((x)->si) #define PT_REGS_PARM3(x) ((x)->dx) @@ -368,19 +369,49 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #define PT_REGS_RC(x) ((x)->ax) #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) +#else +#ifdef __i386__ +/* i386 kernel is built with -mregparm=3 */ +#define PT_REGS_PARM1(x) ((x)->eax) +#define PT_REGS_PARM2(x) ((x)->edx) +#define PT_REGS_PARM3(x) ((x)->ecx) +#define PT_REGS_PARM4(x) 0 +#define PT_REGS_PARM5(x) 0 +#define PT_REGS_RET(x) ((x)->esp) +#define PT_REGS_FP(x) ((x)->ebp) +#define PT_REGS_RC(x) ((x)->eax) +#define PT_REGS_SP(x) ((x)->esp) +#define PT_REGS_IP(x) ((x)->eip) +#else +#define PT_REGS_PARM1(x) ((x)->rdi) +#define PT_REGS_PARM2(x) ((x)->rsi) +#define PT_REGS_PARM3(x) ((x)->rdx) +#define PT_REGS_PARM4(x) ((x)->rcx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->rsp) +#define PT_REGS_FP(x) ((x)->rbp) +#define PT_REGS_RC(x) ((x)->rax) +#define PT_REGS_SP(x) ((x)->rsp) +#define PT_REGS_IP(x) ((x)->rip) +#endif +#endif #elif defined(bpf_target_s390) -#define PT_REGS_PARM1(x) ((x)->gprs[2]) -#define PT_REGS_PARM2(x) ((x)->gprs[3]) -#define PT_REGS_PARM3(x) ((x)->gprs[4]) -#define PT_REGS_PARM4(x) ((x)->gprs[5]) -#define PT_REGS_PARM5(x) ((x)->gprs[6]) -#define PT_REGS_RET(x) ((x)->gprs[14]) -#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ -#define PT_REGS_RC(x) ((x)->gprs[2]) -#define PT_REGS_SP(x) ((x)->gprs[15]) -#define PT_REGS_IP(x) ((x)->psw.addr) +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_S390 const volatile user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_PARM2(x) (((PT_REGS_S390 *)(x))->gprs[3]) +#define PT_REGS_PARM3(x) (((PT_REGS_S390 *)(x))->gprs[4]) +#define PT_REGS_PARM4(x) (((PT_REGS_S390 *)(x))->gprs[5]) +#define PT_REGS_PARM5(x) (((PT_REGS_S390 *)(x))->gprs[6]) +#define PT_REGS_RET(x) (((PT_REGS_S390 *)(x))->gprs[14]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_S390 *)(x))->gprs[11]) +#define PT_REGS_RC(x) (((PT_REGS_S390 *)(x))->gprs[2]) +#define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15]) +#define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr) #elif defined(bpf_target_arm) @@ -397,16 +428,20 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, #elif defined(bpf_target_arm64) -#define PT_REGS_PARM1(x) ((x)->regs[0]) -#define PT_REGS_PARM2(x) ((x)->regs[1]) -#define PT_REGS_PARM3(x) ((x)->regs[2]) -#define PT_REGS_PARM4(x) ((x)->regs[3]) -#define PT_REGS_PARM5(x) ((x)->regs[4]) -#define PT_REGS_RET(x) ((x)->regs[30]) -#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ -#define PT_REGS_RC(x) ((x)->regs[0]) -#define PT_REGS_SP(x) ((x)->sp) -#define PT_REGS_IP(x) ((x)->pc) +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +struct pt_regs; +#define PT_REGS_ARM64 const volatile struct user_pt_regs +#define PT_REGS_PARM1(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_PARM2(x) (((PT_REGS_ARM64 *)(x))->regs[1]) +#define PT_REGS_PARM3(x) (((PT_REGS_ARM64 *)(x))->regs[2]) +#define PT_REGS_PARM4(x) (((PT_REGS_ARM64 *)(x))->regs[3]) +#define PT_REGS_PARM5(x) (((PT_REGS_ARM64 *)(x))->regs[4]) +#define PT_REGS_RET(x) (((PT_REGS_ARM64 *)(x))->regs[30]) +/* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_FP(x) (((PT_REGS_ARM64 *)(x))->regs[29]) +#define PT_REGS_RC(x) (((PT_REGS_ARM64 *)(x))->regs[0]) +#define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp) +#define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc) #elif defined(bpf_target_mips) -- cgit v1.2.3-55-g7522 From af3c24e0e2ed25177b03b60ada9117bb596e8d95 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 11 Jul 2019 16:29:30 +0200 Subject: selftests/bpf: fix compiling loop{1, 2, 3}.c on s390 Use PT_REGS_RC(ctx) instead of ctx->rax, which is not present on s390. Signed-off-by: Ilya Leoshkevich Reviewed-by: Stanislav Fomichev Tested-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/loop1.c | 2 +- tools/testing/selftests/bpf/progs/loop2.c | 2 +- tools/testing/selftests/bpf/progs/loop3.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index dea395af9ea9..7cdb7f878310 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -18,7 +18,7 @@ int nested_loops(volatile struct pt_regs* ctx) for (j = 0; j < 300; j++) for (i = 0; i < j; i++) { if (j & 1) - m = ctx->rax; + m = PT_REGS_RC(ctx); else m = j; sum += i * m; diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index 0637bd8e8bcf..9b2f808a2863 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -16,7 +16,7 @@ int while_true(volatile struct pt_regs* ctx) int i = 0; while (true) { - if (ctx->rax & 1) + if (PT_REGS_RC(ctx) & 1) i += 3; else i += 7; diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 30a0f6cba080..d727657d51e2 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -16,7 +16,7 @@ int while_true(volatile struct pt_regs* ctx) __u64 i = 0, sum = 0; do { i++; - sum += ctx->rax; + sum += PT_REGS_RC(ctx); } while (i < 0x100000000ULL); return sum; } -- cgit v1.2.3-55-g7522 From 100c4043b808739d808bb5c9f6868d3808f062ea Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 11 Jul 2019 12:29:00 -0400 Subject: tc-tests: updated skbedit tests - Added mask upper bound test case - Added mask validation test case - Added mask replacement case Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/skbedit.json | 117 +++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json index 45e7e89928a5..bf5ebf59c2d4 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json @@ -69,6 +69,123 @@ "matchCount": "0", "teardown": [] }, + { + "id": "d4cd", + "name": "Add skbedit action with valid mark and mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabb", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, + { + "id": "baa7", + "name": "Add skbedit action with valid mark and 32-bit maximum mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xffffffff", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xffffffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, + { + "id": "62a5", + "name": "Add skbedit action with valid mark and mask exceeding 32-bit maximum", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabbccddeeff112233", + "expExitCode": "255", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabbccddeeff112233", + "matchCount": "0", + "teardown": [] + }, + { + "id": "bc15", + "name": "Add skbedit action with valid mark and mask with invalid format", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbedit mark 1/-1234", + "expExitCode": "255", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/-1234", + "matchCount": "0", + "teardown": [] + }, + { + "id": "57c2", + "name": "Replace skbedit action with new mask", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ], + "$TC actions add action skbedit mark 1/0x11223344 index 1" + ], + "cmdUnderTest": "$TC actions replace action skbedit mark 1/0xaabb index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] + }, { "id": "081d", "name": "Add skbedit action with priority", -- cgit v1.2.3-55-g7522 From 39443104c7d3f2b05a4a330fbcef6da68f80d60b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Apr 2019 17:29:24 -0300 Subject: docs: blockdev: convert to ReST Rename the blockdev documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. The drbd sub-directory contains some graphs and data flows. Add those too to the documentation. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/kernel-parameters.txt | 18 +- Documentation/blockdev/drbd/README.txt | 16 - Documentation/blockdev/drbd/data-structure-v9.rst | 42 +++ Documentation/blockdev/drbd/data-structure-v9.txt | 38 -- Documentation/blockdev/drbd/figures.rst | 28 ++ Documentation/blockdev/drbd/index.rst | 19 + Documentation/blockdev/floppy.rst | 255 +++++++++++++ Documentation/blockdev/floppy.txt | 245 ------------ Documentation/blockdev/index.rst | 16 + Documentation/blockdev/nbd.rst | 31 ++ Documentation/blockdev/nbd.txt | 31 -- Documentation/blockdev/paride.rst | 439 ++++++++++++++++++++++ Documentation/blockdev/paride.txt | 417 -------------------- Documentation/blockdev/ramdisk.rst | 177 +++++++++ Documentation/blockdev/ramdisk.txt | 174 --------- Documentation/blockdev/zram.rst | 422 +++++++++++++++++++++ Documentation/blockdev/zram.txt | 355 ----------------- MAINTAINERS | 8 +- drivers/block/Kconfig | 8 +- drivers/block/floppy.c | 2 +- drivers/block/zram/Kconfig | 6 +- tools/testing/selftests/zram/README | 2 +- 22 files changed, 1451 insertions(+), 1298 deletions(-) delete mode 100644 Documentation/blockdev/drbd/README.txt create mode 100644 Documentation/blockdev/drbd/data-structure-v9.rst delete mode 100644 Documentation/blockdev/drbd/data-structure-v9.txt create mode 100644 Documentation/blockdev/drbd/figures.rst create mode 100644 Documentation/blockdev/drbd/index.rst create mode 100644 Documentation/blockdev/floppy.rst delete mode 100644 Documentation/blockdev/floppy.txt create mode 100644 Documentation/blockdev/index.rst create mode 100644 Documentation/blockdev/nbd.rst delete mode 100644 Documentation/blockdev/nbd.txt create mode 100644 Documentation/blockdev/paride.rst delete mode 100644 Documentation/blockdev/paride.txt create mode 100644 Documentation/blockdev/ramdisk.rst delete mode 100644 Documentation/blockdev/ramdisk.txt create mode 100644 Documentation/blockdev/zram.rst delete mode 100644 Documentation/blockdev/zram.txt (limited to 'tools/testing') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a342dd5c95a9..6b2adda1cc03 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1249,7 +1249,7 @@ See also Documentation/fault-injection/. floppy= [HW] - See Documentation/blockdev/floppy.txt. + See Documentation/blockdev/floppy.rst. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -2234,7 +2234,7 @@ memblock=debug [KNL] Enable memblock debug messages. load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -3268,7 +3268,7 @@ pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/blockdev/paride.txt. + See also Documentation/blockdev/paride.rst. pci=option[,option...] [PCI] various PCI subsystem options. @@ -3512,7 +3512,7 @@ needed on a platform with proper driver support. pd. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -3527,10 +3527,10 @@ and performance comparison. pf. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pg. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.rst. @@ -3642,7 +3642,7 @@ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. psi= [KNL] Enable or disable pressure stall information tracking. @@ -3664,7 +3664,7 @@ pstore.backend= Specify the name of the pstore backend to use pt. [PARIDE] - See Documentation/blockdev/paride.txt. + See Documentation/blockdev/paride.rst. pti= [X86_64] Control Page Table Isolation of user and kernel address spaces. Disabling this feature @@ -3693,7 +3693,7 @@ See Documentation/admin-guide/md.rst. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/blockdev/ramdisk.txt. + See Documentation/blockdev/ramdisk.rst. random.trust_cpu={on,off} [KNL] Enable or disable trusting the use of the diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt deleted file mode 100644 index 627b0a1bf35e..000000000000 --- a/Documentation/blockdev/drbd/README.txt +++ /dev/null @@ -1,16 +0,0 @@ -Description - - DRBD is a shared-nothing, synchronously replicated block device. It - is designed to serve as a building block for high availability - clusters and in this context, is a "drop-in" replacement for shared - storage. Simplistically, you could see it as a network RAID 1. - - Please visit http://www.drbd.org to find out more. - -The here included files are intended to help understand the implementation - -DRBD-8.3-data-packets.svg, DRBD-data-packets.svg - relates some functions, and write packets. - -conn-states-8.dot, disk-states-8.dot, node-states-8.dot - The sub graphs of DRBD's state transitions diff --git a/Documentation/blockdev/drbd/data-structure-v9.rst b/Documentation/blockdev/drbd/data-structure-v9.rst new file mode 100644 index 000000000000..66036b901644 --- /dev/null +++ b/Documentation/blockdev/drbd/data-structure-v9.rst @@ -0,0 +1,42 @@ +================================ +kernel data structure for DRBD-9 +================================ + +This describes the in kernel data structure for DRBD-9. Starting with +Linux v3.14 we are reorganizing DRBD to use this data structure. + +Basic Data Structure +==================== + +A node has a number of DRBD resources. Each such resource has a number of +devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD +device is represented by a block device locally. + +The DRBD objects are interconnected to form a matrix as depicted below; a +drbd_peer_device object sits at each intersection between a drbd_device and a +drbd_connection:: + + /--------------+---------------+.....+---------------\ + | resource | device | | device | + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + +--------------+---------------+.....+---------------+ + : : : : : + : : : : : + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + \--------------+---------------+.....+---------------/ + +In this table, horizontally, devices can be accessed from resources by their +volume number. Likewise, peer_devices can be accessed from connections by +their volume number. Objects in the vertical direction are connected by double +linked lists. There are back pointers from peer_devices to their connections a +devices, and from connections and devices to their resource. + +All resources are in the drbd_resources double-linked list. In addition, all +devices can be accessed by their minor device number via the drbd_devices idr. + +The drbd_resource, drbd_connection, and drbd_device objects are reference +counted. The peer_device objects only serve to establish the links between +devices and connections; their lifetime is determined by the lifetime of the +device and connection which they reference. diff --git a/Documentation/blockdev/drbd/data-structure-v9.txt b/Documentation/blockdev/drbd/data-structure-v9.txt deleted file mode 100644 index 1e52a0e32624..000000000000 --- a/Documentation/blockdev/drbd/data-structure-v9.txt +++ /dev/null @@ -1,38 +0,0 @@ -This describes the in kernel data structure for DRBD-9. Starting with -Linux v3.14 we are reorganizing DRBD to use this data structure. - -Basic Data Structure -==================== - -A node has a number of DRBD resources. Each such resource has a number of -devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD -device is represented by a block device locally. - -The DRBD objects are interconnected to form a matrix as depicted below; a -drbd_peer_device object sits at each intersection between a drbd_device and a -drbd_connection: - - /--------------+---------------+.....+---------------\ - | resource | device | | device | - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - +--------------+---------------+.....+---------------+ - : : : : : - : : : : : - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - \--------------+---------------+.....+---------------/ - -In this table, horizontally, devices can be accessed from resources by their -volume number. Likewise, peer_devices can be accessed from connections by -their volume number. Objects in the vertical direction are connected by double -linked lists. There are back pointers from peer_devices to their connections a -devices, and from connections and devices to their resource. - -All resources are in the drbd_resources double-linked list. In addition, all -devices can be accessed by their minor device number via the drbd_devices idr. - -The drbd_resource, drbd_connection, and drbd_device objects are reference -counted. The peer_device objects only serve to establish the links between -devices and connections; their lifetime is determined by the lifetime of the -device and connection which they reference. diff --git a/Documentation/blockdev/drbd/figures.rst b/Documentation/blockdev/drbd/figures.rst new file mode 100644 index 000000000000..3e3fd4b8a478 --- /dev/null +++ b/Documentation/blockdev/drbd/figures.rst @@ -0,0 +1,28 @@ +.. The here included files are intended to help understand the implementation + +Data flows that Relate some functions, and write packets +======================================================== + +.. kernel-figure:: DRBD-8.3-data-packets.svg + :alt: DRBD-8.3-data-packets.svg + :align: center + +.. kernel-figure:: DRBD-data-packets.svg + :alt: DRBD-data-packets.svg + :align: center + + +Sub graphs of DRBD's state transitions +====================================== + +.. kernel-figure:: conn-states-8.dot + :alt: conn-states-8.dot + :align: center + +.. kernel-figure:: disk-states-8.dot + :alt: disk-states-8.dot + :align: center + +.. kernel-figure:: node-states-8.dot + :alt: node-states-8.dot + :align: center diff --git a/Documentation/blockdev/drbd/index.rst b/Documentation/blockdev/drbd/index.rst new file mode 100644 index 000000000000..68ecd5c113e9 --- /dev/null +++ b/Documentation/blockdev/drbd/index.rst @@ -0,0 +1,19 @@ +========================================== +Distributed Replicated Block Device - DRBD +========================================== + +Description +=========== + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +.. toctree:: + :maxdepth: 1 + + data-structure-v9 + figures diff --git a/Documentation/blockdev/floppy.rst b/Documentation/blockdev/floppy.rst new file mode 100644 index 000000000000..4a8f31cf4139 --- /dev/null +++ b/Documentation/blockdev/floppy.rst @@ -0,0 +1,255 @@ +============= +Floppy Driver +============= + +FAQ list: +========= + +A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + +The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + +Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad):: + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9:: + + append = "floppy=thinkpad" + +Several floppy related options may be given, example:: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + +If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + +If you use the floppy driver as a module, use the following syntax:: + + modprobe floppy floppy="" + +Example:: + + modprobe floppy floppy="omnibook messages" + +If you need certain options enabled every time you load the floppy driver, +you can put:: + + options floppy floppy="omnibook messages" + +in a configuration file in /etc/modprobe.d/. + + +The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc / floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook / floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + == ================================== + 0 Use the value of the physical CMOS + 1 5 1/4 DD + 2 5 1/4 HD + 3 3 1/2 DD + 4 3 1/2 HD + 5 3 1/2 ED + 6 3 1/2 ED + 16 unknown or not installed + == ================================== + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts / floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate:: + + PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases. + + +Supporting utilities and additional documentation: +================================================== + +Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + + http://fdutils.linux.lu + +The fdutils releases can be found at: + + http://fdutils.linux.lu/download.html + + http://www.tux.org/pub/knaff/fdutils/ + + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + +If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + +Be sure to read the FAQ before mailing/posting any bug reports! + +Alain + +Changelog +========= + +10-30-2004 : + Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : + Original Document diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt deleted file mode 100644 index e2240f5ab64d..000000000000 --- a/Documentation/blockdev/floppy.txt +++ /dev/null @@ -1,245 +0,0 @@ -This file describes the floppy driver. - -FAQ list: -========= - - A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - - The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - - Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad): - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9: - - append = "floppy=thinkpad" - - Several floppy related options may be given, example: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - - If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - - If you use the floppy driver as a module, use the following syntax: -modprobe floppy floppy="" - -Example: - modprobe floppy floppy="omnibook messages" - - If you need certain options enabled every time you load the floppy driver, -you can put: - - options floppy floppy="omnibook messages" - -in a configuration file in /etc/modprobe.d/. - - - The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc - floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook - floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - 0 - Use the value of the physical CMOS - 1 - 5 1/4 DD - 2 - 5 1/4 HD - 3 - 3 1/2 DD - 4 - 3 1/2 HD - 5 - 3 1/2 ED - 6 - 3 1/2 ED - 16 - unknown or not installed - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts - floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate: - " PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases." - - -Supporting utilities and additional documentation: -================================================== - - Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - http://fdutils.linux.lu - -The fdutils releases can be found at: - http://fdutils.linux.lu/download.html - http://www.tux.org/pub/knaff/fdutils/ - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - - If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - - Be sure to read the FAQ before mailing/posting any bug reports! - - Alain - -Changelog -========= - -10-30-2004 : Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : Original Document diff --git a/Documentation/blockdev/index.rst b/Documentation/blockdev/index.rst new file mode 100644 index 000000000000..a9af6ed8b4aa --- /dev/null +++ b/Documentation/blockdev/index.rst @@ -0,0 +1,16 @@ +:orphan: + +=========================== +The Linux RapidIO Subsystem +=========================== + +.. toctree:: + :maxdepth: 1 + + floppy + nbd + paride + ramdisk + zram + + drbd/index diff --git a/Documentation/blockdev/nbd.rst b/Documentation/blockdev/nbd.rst new file mode 100644 index 000000000000..d78dfe559dcf --- /dev/null +++ b/Documentation/blockdev/nbd.rst @@ -0,0 +1,31 @@ +================================== +Network Block Device (TCP version) +================================== + +1) Overview +----------- + +What is it: With this compiled in the kernel (or as a module), Linux +can use a remote server as one of its block devices. So every time +the client computer wants to read, e.g., /dev/nb0, it sends a +request over TCP to the server, which will reply with the data read. +This can be used for stations with low disk space (or even diskless) +to borrow disk space from another computer. +Unlike NFS, it is possible to put any filesystem on it, etc. + +For more information, or to download the nbd-client and nbd-server +tools, go to http://nbd.sf.net/. + +The nbd kernel module need only be installed on the client +system, as the nbd-server is completely in userspace. In fact, +the nbd-server has been successfully ported to other operating +systems, including Windows. + +A) NBD parameters +----------------- + +max_part + Number of partitions per device (default: 0). + +nbds_max + Number of block devices that should be initialized (default: 16). diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt deleted file mode 100644 index db242ea2bce8..000000000000 --- a/Documentation/blockdev/nbd.txt +++ /dev/null @@ -1,31 +0,0 @@ -Network Block Device (TCP version) -================================== - -1) Overview ------------ - -What is it: With this compiled in the kernel (or as a module), Linux -can use a remote server as one of its block devices. So every time -the client computer wants to read, e.g., /dev/nb0, it sends a -request over TCP to the server, which will reply with the data read. -This can be used for stations with low disk space (or even diskless) -to borrow disk space from another computer. -Unlike NFS, it is possible to put any filesystem on it, etc. - -For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. - -The nbd kernel module need only be installed on the client -system, as the nbd-server is completely in userspace. In fact, -the nbd-server has been successfully ported to other operating -systems, including Windows. - -A) NBD parameters ------------------ - -max_part - Number of partitions per device (default: 0). - -nbds_max - Number of block devices that should be initialized (default: 16). - diff --git a/Documentation/blockdev/paride.rst b/Documentation/blockdev/paride.rst new file mode 100644 index 000000000000..87b4278bf314 --- /dev/null +++ b/Documentation/blockdev/paride.rst @@ -0,0 +1,439 @@ +=================================== +Linux and parallel port IDE devices +=================================== + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction +=============== + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + - MicroSolutions backpack CD-ROM + - MicroSolutions backpack PD/CD + - MicroSolutions backpack hard-drives + - MicroSolutions backpack 8000t tape drive + - SyQuest EZ-135, EZ-230 & SparQ drives + - Avatar Shark + - Imation Superdisk LS-120 + - Maxell Superdisk LS-120 + - FreeCom Power CD + - Hewlett-Packard 5GB and 8GB tape drives + - Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + === ============= + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + === ============= + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + ==== ====================================== ==== + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + ==== ====================================== ==== + + +2. Using the PARIDE subsystem +============================= + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + ================ ============ ====== ======== + Manufacturer Model Driver Protocol + ================ ============ ====== ======== + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + ================ ============ ====== ======== + +2.1 Configuring built-in drivers +--------------------------------- + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like:: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command:: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules +---------------------------------------------- + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: + using these drivers with the "kerneld" automatic module loading + system is not recommended for beginners, and is not documented here. + +Note 2: + if you build PARPORT support as a loadable module, PARIDE must + also be built as loadable modules, and PARPORT must be loaded before + the PARIDE modules. + +To use PARIDE, you must begin by:: + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example:: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command:: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378:: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device +-------------------------- + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute:: + + #!/bin/bash + # + # mkd -- a script to create the device special files for the PARIDE subsystem + # + function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 + } + # + function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done + } + # + cd /dev + # + for u in 0 1 2 3 ; do pd $u ; done + for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done + for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done + for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done + for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done + for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done + # + # end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like:: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system:: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver +------------------ + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver +------------------------ + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver +------------------------ + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting +================== + +3.1 Use EPP mode if you can +---------------------------- + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay +------------------------- + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset +------------------------------------- + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing:: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help +------------------------------------------------------ + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in:: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help +--------------------------------- + +You can join the linux-parport mailing list by sending a mail message +to: + + linux-parport-request@torque.net + +with the single word:: + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt deleted file mode 100644 index ee6717e3771d..000000000000 --- a/Documentation/blockdev/paride.txt +++ /dev/null @@ -1,417 +0,0 @@ - - Linux and parallel port IDE devices - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - MicroSolutions backpack CD-ROM - MicroSolutions backpack PD/CD - MicroSolutions backpack hard-drives - MicroSolutions backpack 8000t tape drive - SyQuest EZ-135, EZ-230 & SparQ drives - Avatar Shark - Imation Superdisk LS-120 - Maxell Superdisk LS-120 - FreeCom Power CD - Hewlett-Packard 5GB and 8GB tape drives - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - - -2. Using the PARIDE subsystem - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - Manufacturer Model Driver Protocol - - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - -2.1 Configuring built-in drivers - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: using these drivers with the "kerneld" automatic module loading -system is not recommended for beginners, and is not documented here. - -Note 2: if you build PARPORT support as a loadable module, PARIDE must -also be built as loadable modules, and PARPORT must be loaded before the -PARIDE modules. - -To use PARIDE, you must begin by - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute: - -#!/bin/bash -# -# mkd -- a script to create the device special files for the PARIDE subsystem -# -function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 -} -# -function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done -} -# -cd /dev -# -for u in 0 1 2 3 ; do pd $u ; done -for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done -for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done -for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done -for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done -for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done -# -# end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting - -3.1 Use EPP mode if you can - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help - -You can join the linux-parport mailing list by sending a mail message -to - linux-parport-request@torque.net - -with the single word - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://web.archive.org/web/*/http://www.torque.net/parport/ - - diff --git a/Documentation/blockdev/ramdisk.rst b/Documentation/blockdev/ramdisk.rst new file mode 100644 index 000000000000..b7c2268f8dec --- /dev/null +++ b/Documentation/blockdev/ramdisk.rst @@ -0,0 +1,177 @@ +========================================== +Using the RAM disk block device with Linux +========================================== + +.. Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Parameters +--------------------------------- + +2a) Kernel Command Line Parameters + + ramdisk_size=N + Size of the ramdisk. + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB). + +2b) Module parameters + + rd_nr + /dev/ramX devices created. + + max_part + Maximum partition number. + + rd_size + See ramdisk_size. + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below:: + + ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF + ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 + ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do:: + + /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use:: + + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" + +Since the default start = 0 and the default prompt = 1, you could use:: + + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +----------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create:: + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example:: + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing:: + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy:: + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB):: + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552:: + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + + + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : + Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : + Original Document diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt deleted file mode 100644 index 501e12e0323e..000000000000 --- a/Documentation/blockdev/ramdisk.txt +++ /dev/null @@ -1,174 +0,0 @@ -Using the RAM disk block device with Linux ------------------------------------------- - -Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Parameters ---------------------------------- - -2a) Kernel Command Line Parameters - - ramdisk_size=N - ============== - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB). - -2b) Module parameters - - rd_nr - ===== - /dev/ramX devices created. - - max_part - ======== - Maximum partition number. - - rd_size - ======= - See ramdisk_size. - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below: - -./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF -./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 -./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do: - - /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use: - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" -Since the default start = 0 and the default prompt = 1, you could use: - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ----------------------------------------------- - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create. - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example. - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing. - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB). - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552. - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - --------------------------------------------------------------------------- - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : Original Document diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst new file mode 100644 index 000000000000..2111231c9c0f --- /dev/null +++ b/Documentation/blockdev/zram.rst @@ -0,0 +1,422 @@ +======================================== +zram: Compressed RAM based block devices +======================================== + +Introduction +============ + +The zram module creates RAM based block devices named /dev/zram +( = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram/ + +Usage +===== + +There are several ways to configure and manage zram device(-s): + +a) using zram and zram_control sysfs attributes +b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). + +In this document we will describe only 'manual' zram configuration steps, +IOW, zram and zram_control sysfs attributes. + +In order to get a better idea about zramctl please consult util-linux +documentation, zramctl man-page or `zramctl --help`. Please be informed +that zram maintainers do not develop/maintain util-linux or zramctl, should +you have any questions please contact util-linux@vger.kernel.org + +Following shows a typical sequence of steps for using zram. + +WARNING +======= + +For the sake of simplicity we skip error checking parts in most of the +examples below. However, it is your sole responsibility to handle errors. + +zram sysfs attributes always return negative values in case of errors. +The list of possible return codes: + +======== ============================================================= +-EBUSY an attempt to modify an attribute that cannot be changed once + the device has been initialised. Please reset device first; +-ENOMEM zram was not able to allocate enough memory to fulfil your + needs; +-EINVAL invalid input has been provided. +======== ============================================================= + +If you use 'echo', the returned value that is changed by 'echo' utility, +and, in general case, something like:: + + echo 3 > /sys/block/zram0/max_comp_streams + if [ $? -ne 0 ]; + handle_error + fi + +should suffice. + +1) Load Module +============== + +:: + + modprobe zram num_devices=4 + This creates 4 devices: /dev/zram{0,1,2,3} + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. + +2) Set max number of compression streams +======================================== + +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available:: + + cat /sys/block/zram0/max_comp_streams + +3) Select compression algorithm +=============================== + +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). + +Examples:: + + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +For the time being, the `comp_algorithm` content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + +4) Set Disksize +=============== + +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples:: + + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize + + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize + +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + +5) Set memory limit: Optional +============================= + +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples:: + + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate +=========== + +:: + + mkswap /dev/zram0 + swapon /dev/zram0 + + mkfs.ext4 /dev/zram1 + mount /dev/zram1 /tmp + +7) Add/remove zram devices +========================== + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram) or error code. + +Example:: + + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute:: + + echo X > /sys/class/zram-control/hot_remove + +8) Stats +======== + +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +====================== ====== =============================================== +Name access description +====================== ====== =============================================== +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +mem_used_max WO reset the `mem_used_max` counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can + use to store the compressed data +writeback_limit WO specifies the maximum amount of write IO zram + can write out to backing device as 4KB unit +writeback_limit_enable RW show and set writeback_limit feature +max_comp_streams RW the number of possible concurrent compress + operations +comp_algorithm RW show and change the compression algorithm +compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out +idle WO mark allocated slot as idle +====================== ====== =============================================== + + +User space is advised to use the following files to read the device statistics. + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.txt for +details. + +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + + ============= ============================================================= + failed_reads The number of failed reads + failed_writes The number of failed writes + invalid_io The number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + + a) the number of pages freed because of swap slot free + notifications + b) the number of pages freed because of + REQ_OP_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. + ============= ============================================================= + +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + + ================ ============================================================= + orig_data_size uncompressed size of data stored in this disk. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + same_pages the number of same element filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + ================ ============================================================= + +File /sys/block/zram/bd_stat + +The stat file represents device's backing device statistics. It consists of +a single line of text and contains the following stats separated by whitespace: + + ============== ============================================================= + bd_count size of data written in backing device. + Unit: 4K bytes + bd_reads the number of reads from backing device + Unit: 4K bytes + bd_writes the number of writes to backing device + Unit: 4K bytes + ============== ============================================================= + +9) Deactivate +============= + +:: + + swapoff /dev/zram0 + umount /dev/zram1 + +10) Reset +========= + + Write any positive value to 'reset' sysfs node:: + + echo 1 > /sys/block/zram0/reset + echo 1 > /sys/block/zram1/reset + + This frees all the memory allocated for the given device and + resets the disksize to zero. You must set the disksize again + before reusing the device. + +Optional Feature +================ + +writeback +--------- + +With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page +to backing storage rather than keeping it in memory. +To use the feature, admin should set up backing device via:: + + echo /dev/sda5 > /sys/block/zramX/backing_dev + +before disksize setting. It supports only partition at this moment. +If admin want to use incompressible page writeback, they could do via:: + + echo huge > /sys/block/zramX/write + +To use idle page writeback, first, user need to declare zram pages +as idle:: + + echo all > /sys/block/zramX/idle + +From now on, any pages on zram are idle pages. The idle mark +will be removed until someone request access of the block. +IOW, unless there is access request, those pages are still idle pages. + +Admin can request writeback of those idle pages at right timing via:: + + echo idle > /sys/block/zramX/writeback + +With the command, zram writeback idle pages from memory to the storage. + +If there are lots of write IO with flash device, potentially, it has +flash wearout problem so that admin needs to design write limitation +to guarantee storage health for entire product life. + +To overcome the concern, zram supports "writeback_limit" feature. +The "writeback_limit_enable"'s default value is 0 so that it doesn't limit +any writeback. IOW, if admin want to apply writeback budget, he should +enable writeback_limit_enable via:: + + $ echo 1 > /sys/block/zramX/writeback_limit_enable + +Once writeback_limit_enable is set, zram doesn't allow any writeback +until admin set the budget via /sys/block/zramX/writeback_limit. + +(If admin doesn't enable writeback_limit_enable, writeback_limit's value +assigned via /sys/block/zramX/writeback_limit is meaninless.) + +If admin want to limit writeback as per-day 400M, he could do it +like below:: + + $ MB_SHIFT=20 + $ 4K_SHIFT=12 + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit. + $ echo 1 > /sys/block/zram0/writeback_limit_enable + +If admin want to allow further write again once the bugdet is exausted, +he could do it like below:: + + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit + +If admin want to see remaining writeback budget since he set:: + + $ cat /sys/block/zramX/writeback_limit + +If admin want to disable writeback limit, he could do:: + + $ echo 0 > /sys/block/zramX/writeback_limit_enable + +The writeback_limit count will reset whenever you reset zram(e.g., +system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of +writeback happened until you reset the zram to allocate extra writeback +budget in next setting is user's job. + +If admin want to measure writeback count in a certain period, he could +know it via /sys/block/zram0/bd_stat's 3rd column. + +memory tracking +=============== + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. + +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: + + 300 75.033841 .wh. + 301 63.806904 s... + 302 63.806919 ..hi + +First column + zram's block index. +Second column + access time since the system was booted +Third column + state of the block: + + s: + same page + w: + written page to backing store + h: + huge page + i: + idle page + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + +Nitin Gupta +ngupta@vflare.org diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt deleted file mode 100644 index 4df0ce271085..000000000000 --- a/Documentation/blockdev/zram.txt +++ /dev/null @@ -1,355 +0,0 @@ -zram: Compressed RAM based block devices ----------------------------------------- - -* Introduction - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -* Usage - -There are several ways to configure and manage zram device(-s): -a) using zram and zram_control sysfs attributes -b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). - -In this document we will describe only 'manual' zram configuration steps, -IOW, zram and zram_control sysfs attributes. - -In order to get a better idea about zramctl please consult util-linux -documentation, zramctl man-page or `zramctl --help'. Please be informed -that zram maintainers do not develop/maintain util-linux or zramctl, should -you have any questions please contact util-linux@vger.kernel.org - -Following shows a typical sequence of steps for using zram. - -WARNING -======= -For the sake of simplicity we skip error checking parts in most of the -examples below. However, it is your sole responsibility to handle errors. - -zram sysfs attributes always return negative values in case of errors. -The list of possible return codes: --EBUSY -- an attempt to modify an attribute that cannot be changed once -the device has been initialised. Please reset device first; --ENOMEM -- zram was not able to allocate enough memory to fulfil your -needs; --EINVAL -- invalid input has been provided. - -If you use 'echo', the returned value that is changed by 'echo' utility, -and, in general case, something like: - - echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; - handle_error - fi - -should suffice. - -1) Load Module: - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - -num_devices parameter is optional and tells zram how many devices should be -pre-created. Default: 1. - -2) Set max number of compression streams -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus -allowing several concurrent compression operations. The number of -allocated compression streams goes down when some of the CPUs -become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. - -To find out how many streams are currently available: - cat /sys/block/zram0/max_comp_streams - -3) Select compression algorithm -Using comp_algorithm device attribute one can see available and -currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised -there is no way to change compression algorithm). - -Examples: - #show supported compression algorithms - cat /sys/block/zram0/comp_algorithm - lzo [lz4] - - #select lzo compression algorithm - echo lzo > /sys/block/zram0/comp_algorithm - -For the time being, the `comp_algorithm' content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). - -4) Set Disksize -Set disk size by writing the value to sysfs node 'disksize'. -The value can be either in bytes or you can use mem suffixes. -Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -Note: -There is little point creating a zram of greater than twice the size of memory -since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the -size of the disk when not in use so a huge zram is wasteful. - -5) Set memory limit: Optional -Set memory limit by writing the value to sysfs node 'mem_limit'. -The value can be either in bytes or you can use mem suffixes. -In addition, you could change the value in runtime. -Examples: - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit - -6) Activate: - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -7) Add/remove zram devices - -zram provides a control interface, which enables dynamic (on-demand) device -addition and removal. - -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. - -Example: - cat /sys/class/zram-control/hot_add - 1 - -To remove the existing /dev/zramX device (where X is a device id) -execute - echo X > /sys/class/zram-control/hot_remove - -8) Stats: -Per-device statistics are exported as various nodes under /sys/block/zram/ - -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. - -Name access description ----- ------ ----------- -disksize RW show and set the device's disk size -initstate RO shows the initialization state of the device -reset WO trigger device reset -mem_used_max WO reset the `mem_used_max' counter (see later) -mem_limit WO specifies the maximum amount of memory ZRAM can use - to store the compressed data -writeback_limit WO specifies the maximum amount of write IO zram can - write out to backing device as 4KB unit -writeback_limit_enable RW show and set writeback_limit feature -max_comp_streams RW the number of possible concurrent compress operations -comp_algorithm RW show and change the compression algorithm -compact WO trigger memory compaction -debug_stat RO this file is used for zram debugging purposes -backing_dev RW set up backend storage for zram to write out -idle WO mark allocated slot as idle - - -User space is advised to use the following files to read the device statistics. - -File /sys/block/zram/stat - -Represents block layer statistics. Read Documentation/block/stat.txt for -details. - -File /sys/block/zram/io_stat - -The stat file represents device's I/O statistics not accounted by block -layer and, thus, not available in zram/stat file. It consists of a -single line of text and contains the following stats separated by -whitespace: - failed_reads the number of failed reads - failed_writes the number of failed writes - invalid_io the number of non-page-size-aligned I/O requests - notify_free Depending on device usage scenario it may account - a) the number of pages freed because of swap slot free - notifications or b) the number of pages freed because of - REQ_OP_DISCARD requests sent by bio. The former ones are - sent to a swap block device when a swap slot is freed, - which implies that this disk is being used as a swap disk. - The latter ones are sent by filesystem mounted with - discard option, whenever some data blocks are getting - discarded. - -File /sys/block/zram/mm_stat - -The stat file represents device's mm statistics. It consists of a single -line of text and contains the following stats separated by whitespace: - orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. - Unit: bytes - compr_data_size compressed size of data stored in this disk - mem_used_total the amount of memory allocated for this disk. This - includes allocator fragmentation and metadata overhead, - allocated for this disk. So, allocator space efficiency - can be calculated using compr_data_size and this statistic. - Unit: bytes - mem_limit the maximum amount of memory ZRAM can use to store - the compressed data - mem_used_max the maximum amount of memory zram have consumed to - store the data - same_pages the number of same element filled pages written to this disk. - No memory is allocated for such pages. - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - -File /sys/block/zram/bd_stat - -The stat file represents device's backing device statistics. It consists of -a single line of text and contains the following stats separated by whitespace: - bd_count size of data written in backing device. - Unit: 4K bytes - bd_reads the number of reads from backing device - Unit: 4K bytes - bd_writes the number of writes to backing device - Unit: 4K bytes - -9) Deactivate: - swapoff /dev/zram0 - umount /dev/zram1 - -10) Reset: - Write any positive value to 'reset' sysfs node - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -* Optional Feature - -= writeback - -With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page -to backing storage rather than keeping it in memory. -To use the feature, admin should set up backing device via - - "echo /dev/sda5 > /sys/block/zramX/backing_dev" - -before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via - - "echo huge > /sys/block/zramX/write" - -To use idle page writeback, first, user need to declare zram pages -as idle. - - "echo all > /sys/block/zramX/idle" - -From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. -IOW, unless there is access request, those pages are still idle pages. - -Admin can request writeback of those idle pages at right timing via - - "echo idle > /sys/block/zramX/writeback" - -With the command, zram writeback idle pages from memory to the storage. - -If there are lots of write IO with flash device, potentially, it has -flash wearout problem so that admin needs to design write limitation -to guarantee storage health for entire product life. - -To overcome the concern, zram supports "writeback_limit" feature. -The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should -enable writeback_limit_enable via - - $ echo 1 > /sys/block/zramX/writeback_limit_enable - -Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. - -(If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) - -If admin want to limit writeback as per-day 400M, he could do it -like below. - - $ MB_SHIFT=20 - $ 4K_SHIFT=12 - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit. - $ echo 1 > /sys/block/zram0/writeback_limit_enable - -If admin want to allow further write again once the bugdet is exausted, -he could do it like below - - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit - -If admin want to see remaining writeback budget since he set, - - $ cat /sys/block/zramX/writeback_limit - -If admin want to disable writeback limit, he could do - - $ echo 0 > /sys/block/zramX/writeback_limit_enable - -The writeback_limit count will reset whenever you reset zram(e.g., -system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of -writeback happened until you reset the zram to allocate extra writeback -budget in next setting is user's job. - -If admin want to measure writeback count in a certain period, he could -know it via /sys/block/zram0/bd_stat's 3rd column. - -= memory tracking - -With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the -zram block. It could be useful to catch cold or incompressible -pages of the process with*pagemap. -If you enable the feature, you could see block state via -/sys/kernel/debug/zram/zram0/block_state". The output is as follows, - - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi - -First column is zram's block index. -Second column is access time since the system was booted -Third column is state of the block. -(s: same page -w: written page to backing store -h: huge page -i: idle page) - -First line of above example says 300th block is accessed at 75.033841sec -and the block's state is huge so it is written back to the backing -storage. It's a debugging feature so anyone shouldn't rely on it to work -properly. - -Nitin Gupta -ngupta@vflare.org diff --git a/MAINTAINERS b/MAINTAINERS index 3ee73751f56c..ec541c8dc645 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11076,7 +11076,7 @@ M: Josef Bacik S: Maintained L: linux-block@vger.kernel.org L: nbd@other.debian.org -F: Documentation/blockdev/nbd.txt +F: Documentation/blockdev/nbd.rst F: drivers/block/nbd.c F: include/trace/events/nbd.h F: include/uapi/linux/nbd.h @@ -12086,7 +12086,7 @@ PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh L: linux-parport@lists.infradead.org (subscribers-only) S: Maintained -F: Documentation/blockdev/paride.txt +F: Documentation/blockdev/paride.rst F: drivers/block/paride/ PARISC ARCHITECTURE @@ -13367,7 +13367,7 @@ F: drivers/net/wireless/ralink/rt2x00/ RAMDISK RAM BLOCK DEVICE DRIVER M: Jens Axboe S: Maintained -F: Documentation/blockdev/ramdisk.txt +F: Documentation/blockdev/ramdisk.rst F: drivers/block/brd.c RANCHU VIRTUAL BOARD FOR MIPS @@ -17723,7 +17723,7 @@ R: Sergey Senozhatsky L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/zram/ -F: Documentation/blockdev/zram.txt +F: Documentation/blockdev/zram.rst ZS DECSTATION Z85C30 SERIAL DRIVER M: "Maciej W. Rozycki" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 96ec7e0fc1ea..c43690b973d8 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -31,7 +31,7 @@ config BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM Thinkpad users, is contained in - . + . That file also contains the location of the Floppy driver FAQ as well as location of the fdutils package used to configure additional parameters of the driver at run time. @@ -96,7 +96,7 @@ config PARIDE your computer's parallel port. Most of them are actually IDE devices using a parallel port IDE adapter. This option enables the PARIDE subsystem which contains drivers for many of these external drives. - Read for more information. + Read for more information. If you have said Y to the "Parallel-port support" configuration option, you may share a single port between your printer and other @@ -261,7 +261,7 @@ config BLK_DEV_NBD userland (making server and client physically the same computer, communicating using the loopback network device). - Read for more information, + Read for more information, especially about where to find the server code, which runs in user space and does not need special kernel support. @@ -303,7 +303,7 @@ config BLK_DEV_RAM during the initial install of Linux. Note that the kernel command line option "ramdisk=XX" is now obsolete. - For details, read . + For details, read . To compile this driver as a module, choose M here: the module will be called brd. An alias "rd" has been defined diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b933a7eea52b..5c99e52f9dc1 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4424,7 +4424,7 @@ static int __init floppy_setup(char *str) pr_cont("\n"); } else DPRINT("botched floppy option\n"); - DPRINT("Read Documentation/blockdev/floppy.txt\n"); + DPRINT("Read Documentation/blockdev/floppy.rst\n"); return 0; } diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 1ffc64770643..e06b99d54816 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -12,7 +12,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" @@ -26,7 +26,7 @@ config ZRAM_WRITEBACK With /sys/block/zramX/{idle,writeback}, application could ask idle page's writeback to the backing device to save in memory. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. config ZRAM_MEMORY_TRACKING bool "Track zRam block status" @@ -36,4 +36,4 @@ config ZRAM_MEMORY_TRACKING of zRAM. Admin could see the information via /sys/kernel/debug/zram/zramX/block_state. - See Documentation/blockdev/zram.txt for more information. + See Documentation/blockdev/zram.rst for more information. diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 7972cc512408..5fa378391d3b 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -37,4 +37,4 @@ Commands required for testing: - mkfs/ mkfs.ext4 For more information please refer: -kernel-source-tree/Documentation/blockdev/zram.txt +kernel-source-tree/Documentation/blockdev/zram.rst -- cgit v1.2.3-55-g7522 From e7751617dd0599ceadf4221cb08e04307b00aa1f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 11:47:10 -0300 Subject: docs: blockdev: add it to the admin-guide The blockdev book basically contains user-faced documentation. Signed-off-by: Mauro Carvalho Chehab --- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 +++++++++++++++++++++ .../blockdev/drbd/DRBD-data-packets.svg | 459 ++++++++++++++++ .../admin-guide/blockdev/drbd/conn-states-8.dot | 18 + .../blockdev/drbd/data-structure-v9.rst | 42 ++ .../admin-guide/blockdev/drbd/disk-states-8.dot | 16 + .../drbd/drbd-connection-state-overview.dot | 85 +++ .../admin-guide/blockdev/drbd/figures.rst | 28 + Documentation/admin-guide/blockdev/drbd/index.rst | 19 + .../admin-guide/blockdev/drbd/node-states-8.dot | 13 + Documentation/admin-guide/blockdev/floppy.rst | 255 +++++++++ Documentation/admin-guide/blockdev/index.rst | 14 + Documentation/admin-guide/blockdev/nbd.rst | 31 ++ Documentation/admin-guide/blockdev/paride.rst | 439 +++++++++++++++ Documentation/admin-guide/blockdev/ramdisk.rst | 177 +++++++ Documentation/admin-guide/blockdev/zram.rst | 422 +++++++++++++++ Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 18 +- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 --------------------- Documentation/blockdev/drbd/DRBD-data-packets.svg | 459 ---------------- Documentation/blockdev/drbd/conn-states-8.dot | 18 - Documentation/blockdev/drbd/data-structure-v9.rst | 42 -- Documentation/blockdev/drbd/disk-states-8.dot | 16 - .../drbd/drbd-connection-state-overview.dot | 85 --- Documentation/blockdev/drbd/figures.rst | 28 - Documentation/blockdev/drbd/index.rst | 19 - Documentation/blockdev/drbd/node-states-8.dot | 14 - Documentation/blockdev/floppy.rst | 255 --------- Documentation/blockdev/index.rst | 16 - Documentation/blockdev/nbd.rst | 31 -- Documentation/blockdev/paride.rst | 439 --------------- Documentation/blockdev/ramdisk.rst | 177 ------- Documentation/blockdev/zram.rst | 422 --------------- MAINTAINERS | 10 +- drivers/block/Kconfig | 8 +- drivers/block/floppy.c | 2 +- drivers/block/zram/Kconfig | 6 +- tools/testing/selftests/zram/README | 2 +- 37 files changed, 2630 insertions(+), 2632 deletions(-) create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg create mode 100644 Documentation/admin-guide/blockdev/drbd/conn-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/disk-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot create mode 100644 Documentation/admin-guide/blockdev/drbd/figures.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/index.rst create mode 100644 Documentation/admin-guide/blockdev/drbd/node-states-8.dot create mode 100644 Documentation/admin-guide/blockdev/floppy.rst create mode 100644 Documentation/admin-guide/blockdev/index.rst create mode 100644 Documentation/admin-guide/blockdev/nbd.rst create mode 100644 Documentation/admin-guide/blockdev/paride.rst create mode 100644 Documentation/admin-guide/blockdev/ramdisk.rst create mode 100644 Documentation/admin-guide/blockdev/zram.rst delete mode 100644 Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/DRBD-data-packets.svg delete mode 100644 Documentation/blockdev/drbd/conn-states-8.dot delete mode 100644 Documentation/blockdev/drbd/data-structure-v9.rst delete mode 100644 Documentation/blockdev/drbd/disk-states-8.dot delete mode 100644 Documentation/blockdev/drbd/drbd-connection-state-overview.dot delete mode 100644 Documentation/blockdev/drbd/figures.rst delete mode 100644 Documentation/blockdev/drbd/index.rst delete mode 100644 Documentation/blockdev/drbd/node-states-8.dot delete mode 100644 Documentation/blockdev/floppy.rst delete mode 100644 Documentation/blockdev/index.rst delete mode 100644 Documentation/blockdev/nbd.rst delete mode 100644 Documentation/blockdev/paride.rst delete mode 100644 Documentation/blockdev/ramdisk.rst delete mode 100644 Documentation/blockdev/zram.rst (limited to 'tools/testing') diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg new file mode 100644 index 000000000000..f87cfa0dc2fb --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg @@ -0,0 +1,588 @@ + + + + + + Master slide + + + + + + + + + + RSDataReply + + + + + + + CsumRSRequest + + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + + WriteAck + + + + got_BlockAck() + + + Checksum based Resync, case not in sync + + + DRBD-8.3 data flow + + + w_e_send_csum() + + + + + + + + RSIsInSync + + + + + + + CsumRSRequest + + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + got_IsInSync() + + + Checksum based Resync, case in sync + + + + + + + + + + OVReply + + + + + + + OVRequest + + + + receive_OVRequest() + + + drbd_endio_read_sec() + + + w_e_end_ov_req() + + + receive_OVReply() + + + drbd_endio_read_sec() + + + w_e_end_ov_reply() + + + + + + OVResult + + + + got_OVResult() + + + Online verify + + + w_make_ov_request() + + + + + + + + drbd_endio_read_sec() + + + w_make_resync_request() + + + w_e_send_csum() + + + + + drbd_endio_read_sec() + + + + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg new file mode 100644 index 000000000000..48a1e2165fec --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg @@ -0,0 +1,459 @@ + + + + + + Master slide + + + + + + + + + RSDataReply + + + + + RSDataRequest + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_rsdata_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + WriteAck + + + got_BlockAck() + + + Resync blocks, 4-32K + + + + + + + WriteAck + + + + + Data + + + drbd_make_request() + + + receive_Data() + + + drbd_endio_write_sec() + + + e_end_block() + + + got_BlockAck() + + + Regular mirrored write, 512-32K + + + w_send_dblock() + + + + + drbd_endio_write_pri() + + + + + + + DataReply + + + + + DataRequest + + + drbd_make_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_data_req() + + + Drawing + + receive_DataReply() + + + + Diskless read, 512-32K + + + w_send_read_req() + + + DRBD 8 data flow + + + + + + al_begin_io() + + + al_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + diff --git a/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot new file mode 100644 index 000000000000..025e8cf5e64a --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot @@ -0,0 +1,18 @@ +digraph conn_states { + StandAllone -> WFConnection [ label = "ioctl_set_net()" ] + WFConnection -> Unconnected [ label = "unable to bind()" ] + WFConnection -> WFReportParams [ label = "in connect() after accept" ] + WFReportParams -> StandAllone [ label = "checks in receive_param()" ] + WFReportParams -> Connected [ label = "in receive_param()" ] + WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] + WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] + WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] + WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] + SyncSource -> Connected + SyncTarget -> Connected + SyncSource -> PausedSyncS + SyncTarget -> PausedSyncT + PausedSyncS -> SyncSource + PausedSyncT -> SyncTarget + Connected -> WFConnection [ label = "* on network error" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst new file mode 100644 index 000000000000..66036b901644 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst @@ -0,0 +1,42 @@ +================================ +kernel data structure for DRBD-9 +================================ + +This describes the in kernel data structure for DRBD-9. Starting with +Linux v3.14 we are reorganizing DRBD to use this data structure. + +Basic Data Structure +==================== + +A node has a number of DRBD resources. Each such resource has a number of +devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD +device is represented by a block device locally. + +The DRBD objects are interconnected to form a matrix as depicted below; a +drbd_peer_device object sits at each intersection between a drbd_device and a +drbd_connection:: + + /--------------+---------------+.....+---------------\ + | resource | device | | device | + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + +--------------+---------------+.....+---------------+ + : : : : : + : : : : : + +--------------+---------------+.....+---------------+ + | connection | peer_device | | peer_device | + \--------------+---------------+.....+---------------/ + +In this table, horizontally, devices can be accessed from resources by their +volume number. Likewise, peer_devices can be accessed from connections by +their volume number. Objects in the vertical direction are connected by double +linked lists. There are back pointers from peer_devices to their connections a +devices, and from connections and devices to their resource. + +All resources are in the drbd_resources double-linked list. In addition, all +devices can be accessed by their minor device number via the drbd_devices idr. + +The drbd_resource, drbd_connection, and drbd_device objects are reference +counted. The peer_device objects only serve to establish the links between +devices and connections; their lifetime is determined by the lifetime of the +device and connection which they reference. diff --git a/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot new file mode 100644 index 000000000000..d06cfb46fb98 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot @@ -0,0 +1,16 @@ +digraph disk_states { + Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] + Diskless -> Consistent [ label = "ioctl_set_disk()" ] + Diskless -> Outdated [ label = "ioctl_set_disk()" ] + Consistent -> Outdated [ label = "receive_param()" ] + Consistent -> UpToDate [ label = "receive_param()" ] + Consistent -> Inconsistent [ label = "start resync" ] + Outdated -> Inconsistent [ label = "start resync" ] + UpToDate -> Inconsistent [ label = "ioctl_replicate" ] + Inconsistent -> UpToDate [ label = "resync completed" ] + Consistent -> Failed [ label = "io completion error" ] + Outdated -> Failed [ label = "io completion error" ] + UpToDate -> Failed [ label = "io completion error" ] + Inconsistent -> Failed [ label = "io completion error" ] + Failed -> Diskless [ label = "sending notify to peer" ] +} diff --git a/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot new file mode 100644 index 000000000000..6d9cf0a7b11d --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot @@ -0,0 +1,85 @@ +// vim: set sw=2 sts=2 : +digraph { + rankdir=BT + bgcolor=white + + node [shape=plaintext] + node [fontcolor=black] + + StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] + + node [fontcolor=lightgray] + + Unconnected [ label=Unconnected ] + + CommTrouble [ shape=record, + label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] + + node [fontcolor=gray] + + subgraph cluster_try_connect { + label="try to connect, handshake" + rank=max + WFConnection [ label=WFConnection ] + WFReportParams [ label=WFReportParams ] + } + + TearDown [ label=TearDown ] + + Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] + + node [fontcolor=lightblue] + + StartingSyncS [ label=StartingSyncS ] + StartingSyncT [ label=StartingSyncT ] + + subgraph cluster_bitmap_exchange { + node [fontcolor=red] + fontcolor=red + label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" + + WFBitMapT [ label=WFBitMapT ] + WFSyncUUID [ label=WFSyncUUID ] + WFBitMapS [ label=WFBitMapS ] + } + + node [fontcolor=blue] + + cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] + + node [shape=box,fontcolor=black] + + // drbdadm [label="drbdadm connect"] + // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] + // comm_error [label="communication trouble"] + + // + // edges + // -------------------------------------- + + StandAlone -> Unconnected [ label="drbdadm connect" ] + Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] + Unconnected -> WFConnection [ label="receiver thread is started" ] + WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] + + WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] + WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] + + WFReportParams -> WFBitMapS + WFReportParams -> WFBitMapT + WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] + + WFBitMapS -> cluster_resync:S + WFSyncUUID -> cluster_resync:T + + edge [color=green] + cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] + + edge [color=red] + WFReportParams -> CommTrouble + Connected -> CommTrouble + cluster_resync:any -> CommTrouble + edge [color=black] + CommTrouble -> Unconnected [label="receiver thread is stopped" ] + +} diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst new file mode 100644 index 000000000000..3e3fd4b8a478 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/figures.rst @@ -0,0 +1,28 @@ +.. The here included files are intended to help understand the implementation + +Data flows that Relate some functions, and write packets +======================================================== + +.. kernel-figure:: DRBD-8.3-data-packets.svg + :alt: DRBD-8.3-data-packets.svg + :align: center + +.. kernel-figure:: DRBD-data-packets.svg + :alt: DRBD-data-packets.svg + :align: center + + +Sub graphs of DRBD's state transitions +====================================== + +.. kernel-figure:: conn-states-8.dot + :alt: conn-states-8.dot + :align: center + +.. kernel-figure:: disk-states-8.dot + :alt: disk-states-8.dot + :align: center + +.. kernel-figure:: node-states-8.dot + :alt: node-states-8.dot + :align: center diff --git a/Documentation/admin-guide/blockdev/drbd/index.rst b/Documentation/admin-guide/blockdev/drbd/index.rst new file mode 100644 index 000000000000..68ecd5c113e9 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/index.rst @@ -0,0 +1,19 @@ +========================================== +Distributed Replicated Block Device - DRBD +========================================== + +Description +=========== + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +.. toctree:: + :maxdepth: 1 + + data-structure-v9 + figures diff --git a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot new file mode 100644 index 000000000000..bfa54e1f8016 --- /dev/null +++ b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot @@ -0,0 +1,13 @@ +digraph node_states { + Secondary -> Primary [ label = "ioctl_set_state()" ] + Primary -> Secondary [ label = "ioctl_set_state()" ] +} + +digraph peer_states { + Secondary -> Primary [ label = "recv state packet" ] + Primary -> Secondary [ label = "recv state packet" ] + Primary -> Unknown [ label = "connection lost" ] + Secondary -> Unknown [ label = "connection lost" ] + Unknown -> Primary [ label = "connected" ] + Unknown -> Secondary [ label = "connected" ] +} diff --git a/Documentation/admin-guide/blockdev/floppy.rst b/Documentation/admin-guide/blockdev/floppy.rst new file mode 100644 index 000000000000..4a8f31cf4139 --- /dev/null +++ b/Documentation/admin-guide/blockdev/floppy.rst @@ -0,0 +1,255 @@ +============= +Floppy Driver +============= + +FAQ list: +========= + +A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + +The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + +Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad):: + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9:: + + append = "floppy=thinkpad" + +Several floppy related options may be given, example:: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + +If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + +If you use the floppy driver as a module, use the following syntax:: + + modprobe floppy floppy="" + +Example:: + + modprobe floppy floppy="omnibook messages" + +If you need certain options enabled every time you load the floppy driver, +you can put:: + + options floppy floppy="omnibook messages" + +in a configuration file in /etc/modprobe.d/. + + +The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc / floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook / floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + == ================================== + 0 Use the value of the physical CMOS + 1 5 1/4 DD + 2 5 1/4 HD + 3 3 1/2 DD + 4 3 1/2 HD + 5 3 1/2 ED + 6 3 1/2 ED + 16 unknown or not installed + == ================================== + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts / floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate:: + + PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases. + + +Supporting utilities and additional documentation: +================================================== + +Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + + http://fdutils.linux.lu + +The fdutils releases can be found at: + + http://fdutils.linux.lu/download.html + + http://www.tux.org/pub/knaff/fdutils/ + + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + +If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + +Be sure to read the FAQ before mailing/posting any bug reports! + +Alain + +Changelog +========= + +10-30-2004 : + Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : + Original Document diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst new file mode 100644 index 000000000000..20a738d9d047 --- /dev/null +++ b/Documentation/admin-guide/blockdev/index.rst @@ -0,0 +1,14 @@ +=========================== +The Linux RapidIO Subsystem +=========================== + +.. toctree:: + :maxdepth: 1 + + floppy + nbd + paride + ramdisk + zram + + drbd/index diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst new file mode 100644 index 000000000000..d78dfe559dcf --- /dev/null +++ b/Documentation/admin-guide/blockdev/nbd.rst @@ -0,0 +1,31 @@ +================================== +Network Block Device (TCP version) +================================== + +1) Overview +----------- + +What is it: With this compiled in the kernel (or as a module), Linux +can use a remote server as one of its block devices. So every time +the client computer wants to read, e.g., /dev/nb0, it sends a +request over TCP to the server, which will reply with the data read. +This can be used for stations with low disk space (or even diskless) +to borrow disk space from another computer. +Unlike NFS, it is possible to put any filesystem on it, etc. + +For more information, or to download the nbd-client and nbd-server +tools, go to http://nbd.sf.net/. + +The nbd kernel module need only be installed on the client +system, as the nbd-server is completely in userspace. In fact, +the nbd-server has been successfully ported to other operating +systems, including Windows. + +A) NBD parameters +----------------- + +max_part + Number of partitions per device (default: 0). + +nbds_max + Number of block devices that should be initialized (default: 16). diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst new file mode 100644 index 000000000000..87b4278bf314 --- /dev/null +++ b/Documentation/admin-guide/blockdev/paride.rst @@ -0,0 +1,439 @@ +=================================== +Linux and parallel port IDE devices +=================================== + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction +=============== + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + - MicroSolutions backpack CD-ROM + - MicroSolutions backpack PD/CD + - MicroSolutions backpack hard-drives + - MicroSolutions backpack 8000t tape drive + - SyQuest EZ-135, EZ-230 & SparQ drives + - Avatar Shark + - Imation Superdisk LS-120 + - Maxell Superdisk LS-120 + - FreeCom Power CD + - Hewlett-Packard 5GB and 8GB tape drives + - Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + === ============= + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + === ============= + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + ==== ====================================== ==== + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + ==== ====================================== ==== + + +2. Using the PARIDE subsystem +============================= + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + ================ ============ ====== ======== + Manufacturer Model Driver Protocol + ================ ============ ====== ======== + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + ================ ============ ====== ======== + +2.1 Configuring built-in drivers +--------------------------------- + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like:: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command:: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules +---------------------------------------------- + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: + using these drivers with the "kerneld" automatic module loading + system is not recommended for beginners, and is not documented here. + +Note 2: + if you build PARPORT support as a loadable module, PARIDE must + also be built as loadable modules, and PARPORT must be loaded before + the PARIDE modules. + +To use PARIDE, you must begin by:: + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example:: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command:: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378:: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device +-------------------------- + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute:: + + #!/bin/bash + # + # mkd -- a script to create the device special files for the PARIDE subsystem + # + function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 + } + # + function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done + } + # + cd /dev + # + for u in 0 1 2 3 ; do pd $u ; done + for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done + for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done + for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done + for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done + for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done + # + # end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like:: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system:: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver +------------------ + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver +------------------------ + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver +------------------------ + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting +================== + +3.1 Use EPP mode if you can +---------------------------- + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay +------------------------- + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset +------------------------------------- + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing:: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help +------------------------------------------------------ + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in:: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help +--------------------------------- + +You can join the linux-parport mailing list by sending a mail message +to: + + linux-parport-request@torque.net + +with the single word:: + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/admin-guide/blockdev/ramdisk.rst b/Documentation/admin-guide/blockdev/ramdisk.rst new file mode 100644 index 000000000000..b7c2268f8dec --- /dev/null +++ b/Documentation/admin-guide/blockdev/ramdisk.rst @@ -0,0 +1,177 @@ +========================================== +Using the RAM disk block device with Linux +========================================== + +.. Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Parameters +--------------------------------- + +2a) Kernel Command Line Parameters + + ramdisk_size=N + Size of the ramdisk. + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB). + +2b) Module parameters + + rd_nr + /dev/ramX devices created. + + max_part + Maximum partition number. + + rd_size + See ramdisk_size. + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below:: + + ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF + ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 + ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do:: + + /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use:: + + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" + +Since the default start = 0 and the default prompt = 1, you could use:: + + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +----------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create:: + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example:: + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing:: + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy:: + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB):: + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552:: + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + + + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : + Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : + Original Document diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst new file mode 100644 index 000000000000..6eccf13219ff --- /dev/null +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -0,0 +1,422 @@ +======================================== +zram: Compressed RAM based block devices +======================================== + +Introduction +============ + +The zram module creates RAM based block devices named /dev/zram +( = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram/ + +Usage +===== + +There are several ways to configure and manage zram device(-s): + +a) using zram and zram_control sysfs attributes +b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). + +In this document we will describe only 'manual' zram configuration steps, +IOW, zram and zram_control sysfs attributes. + +In order to get a better idea about zramctl please consult util-linux +documentation, zramctl man-page or `zramctl --help`. Please be informed +that zram maintainers do not develop/maintain util-linux or zramctl, should +you have any questions please contact util-linux@vger.kernel.org + +Following shows a typical sequence of steps for using zram. + +WARNING +======= + +For the sake of simplicity we skip error checking parts in most of the +examples below. However, it is your sole responsibility to handle errors. + +zram sysfs attributes always return negative values in case of errors. +The list of possible return codes: + +======== ============================================================= +-EBUSY an attempt to modify an attribute that cannot be changed once + the device has been initialised. Please reset device first; +-ENOMEM zram was not able to allocate enough memory to fulfil your + needs; +-EINVAL invalid input has been provided. +======== ============================================================= + +If you use 'echo', the returned value that is changed by 'echo' utility, +and, in general case, something like:: + + echo 3 > /sys/block/zram0/max_comp_streams + if [ $? -ne 0 ]; + handle_error + fi + +should suffice. + +1) Load Module +============== + +:: + + modprobe zram num_devices=4 + This creates 4 devices: /dev/zram{0,1,2,3} + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. + +2) Set max number of compression streams +======================================== + +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. + +To find out how many streams are currently available:: + + cat /sys/block/zram0/max_comp_streams + +3) Select compression algorithm +=============================== + +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). + +Examples:: + + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +For the time being, the `comp_algorithm` content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). + +4) Set Disksize +=============== + +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples:: + + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize + + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize + +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + +5) Set memory limit: Optional +============================= + +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. +Examples:: + + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate +=========== + +:: + + mkswap /dev/zram0 + swapon /dev/zram0 + + mkfs.ext4 /dev/zram1 + mount /dev/zram1 /tmp + +7) Add/remove zram devices +========================== + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram) or error code. + +Example:: + + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute:: + + echo X > /sys/class/zram-control/hot_remove + +8) Stats +======== + +Per-device statistics are exported as various nodes under /sys/block/zram/ + +A brief description of exported device attributes. For more details please +read Documentation/ABI/testing/sysfs-block-zram. + +====================== ====== =============================================== +Name access description +====================== ====== =============================================== +disksize RW show and set the device's disk size +initstate RO shows the initialization state of the device +reset WO trigger device reset +mem_used_max WO reset the `mem_used_max` counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can + use to store the compressed data +writeback_limit WO specifies the maximum amount of write IO zram + can write out to backing device as 4KB unit +writeback_limit_enable RW show and set writeback_limit feature +max_comp_streams RW the number of possible concurrent compress + operations +comp_algorithm RW show and change the compression algorithm +compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out +idle WO mark allocated slot as idle +====================== ====== =============================================== + + +User space is advised to use the following files to read the device statistics. + +File /sys/block/zram/stat + +Represents block layer statistics. Read Documentation/block/stat.rst for +details. + +File /sys/block/zram/io_stat + +The stat file represents device's I/O statistics not accounted by block +layer and, thus, not available in zram/stat file. It consists of a +single line of text and contains the following stats separated by +whitespace: + + ============= ============================================================= + failed_reads The number of failed reads + failed_writes The number of failed writes + invalid_io The number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + + a) the number of pages freed because of swap slot free + notifications + b) the number of pages freed because of + REQ_OP_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. + ============= ============================================================= + +File /sys/block/zram/mm_stat + +The stat file represents device's mm statistics. It consists of a single +line of text and contains the following stats separated by whitespace: + + ================ ============================================================= + orig_data_size uncompressed size of data stored in this disk. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + same_pages the number of same element filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + ================ ============================================================= + +File /sys/block/zram/bd_stat + +The stat file represents device's backing device statistics. It consists of +a single line of text and contains the following stats separated by whitespace: + + ============== ============================================================= + bd_count size of data written in backing device. + Unit: 4K bytes + bd_reads the number of reads from backing device + Unit: 4K bytes + bd_writes the number of writes to backing device + Unit: 4K bytes + ============== ============================================================= + +9) Deactivate +============= + +:: + + swapoff /dev/zram0 + umount /dev/zram1 + +10) Reset +========= + + Write any positive value to 'reset' sysfs node:: + + echo 1 > /sys/block/zram0/reset + echo 1 > /sys/block/zram1/reset + + This frees all the memory allocated for the given device and + resets the disksize to zero. You must set the disksize again + before reusing the device. + +Optional Feature +================ + +writeback +--------- + +With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page +to backing storage rather than keeping it in memory. +To use the feature, admin should set up backing device via:: + + echo /dev/sda5 > /sys/block/zramX/backing_dev + +before disksize setting. It supports only partition at this moment. +If admin want to use incompressible page writeback, they could do via:: + + echo huge > /sys/block/zramX/write + +To use idle page writeback, first, user need to declare zram pages +as idle:: + + echo all > /sys/block/zramX/idle + +From now on, any pages on zram are idle pages. The idle mark +will be removed until someone request access of the block. +IOW, unless there is access request, those pages are still idle pages. + +Admin can request writeback of those idle pages at right timing via:: + + echo idle > /sys/block/zramX/writeback + +With the command, zram writeback idle pages from memory to the storage. + +If there are lots of write IO with flash device, potentially, it has +flash wearout problem so that admin needs to design write limitation +to guarantee storage health for entire product life. + +To overcome the concern, zram supports "writeback_limit" feature. +The "writeback_limit_enable"'s default value is 0 so that it doesn't limit +any writeback. IOW, if admin want to apply writeback budget, he should +enable writeback_limit_enable via:: + + $ echo 1 > /sys/block/zramX/writeback_limit_enable + +Once writeback_limit_enable is set, zram doesn't allow any writeback +until admin set the budget via /sys/block/zramX/writeback_limit. + +(If admin doesn't enable writeback_limit_enable, writeback_limit's value +assigned via /sys/block/zramX/writeback_limit is meaninless.) + +If admin want to limit writeback as per-day 400M, he could do it +like below:: + + $ MB_SHIFT=20 + $ 4K_SHIFT=12 + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit. + $ echo 1 > /sys/block/zram0/writeback_limit_enable + +If admin want to allow further write again once the bugdet is exausted, +he could do it like below:: + + $ echo $((400<>4K_SHIFT)) > \ + /sys/block/zram0/writeback_limit + +If admin want to see remaining writeback budget since he set:: + + $ cat /sys/block/zramX/writeback_limit + +If admin want to disable writeback limit, he could do:: + + $ echo 0 > /sys/block/zramX/writeback_limit_enable + +The writeback_limit count will reset whenever you reset zram(e.g., +system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of +writeback happened until you reset the zram to allocate extra writeback +budget in next setting is user's job. + +If admin want to measure writeback count in a certain period, he could +know it via /sys/block/zram0/bd_stat's 3rd column. + +memory tracking +=============== + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. + +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: + + 300 75.033841 .wh. + 301 63.806904 s... + 302 63.806919 ..hi + +First column + zram's block index. +Second column + access time since the system was booted +Third column + state of the block: + + s: + same page + w: + written page to backing store + h: + huge page + i: + idle page + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + +Nitin Gupta +ngupta@vflare.org diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 5b63182ceb5f..9228fbf5ce4e 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -73,6 +73,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + blockdev/index ext4 binderfs pm/index diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e645b3ab4b6f..78576aa45cce 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1249,7 +1249,7 @@ See also Documentation/fault-injection/. floppy= [HW] - See Documentation/blockdev/floppy.rst. + See Documentation/admin-guide/blockdev/floppy.rst. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -2234,7 +2234,7 @@ memblock=debug [KNL] Enable memblock debug messages. load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -3268,7 +3268,7 @@ pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/blockdev/paride.rst. + See also Documentation/admin-guide/blockdev/paride.rst. pci=option[,option...] [PCI] various PCI subsystem options. @@ -3512,7 +3512,7 @@ needed on a platform with proper driver support. pd. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -3527,10 +3527,10 @@ and performance comparison. pf. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pg. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.rst. @@ -3642,7 +3642,7 @@ prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. psi= [KNL] Enable or disable pressure stall information tracking. @@ -3664,7 +3664,7 @@ pstore.backend= Specify the name of the pstore backend to use pt. [PARIDE] - See Documentation/blockdev/paride.rst. + See Documentation/admin-guide/blockdev/paride.rst. pti= [X86_64] Control Page Table Isolation of user and kernel address spaces. Disabling this feature @@ -3693,7 +3693,7 @@ See Documentation/admin-guide/md.rst. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/blockdev/ramdisk.rst. + See Documentation/admin-guide/blockdev/ramdisk.rst. random.trust_cpu={on,off} [KNL] Enable or disable trusting the use of the diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg deleted file mode 100644 index f87cfa0dc2fb..000000000000 --- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg +++ /dev/null @@ -1,588 +0,0 @@ - - - - - - Master slide - - - - - - - - - - RSDataReply - - - - - - - CsumRSRequest - - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - - WriteAck - - - - got_BlockAck() - - - Checksum based Resync, case not in sync - - - DRBD-8.3 data flow - - - w_e_send_csum() - - - - - - - - RSIsInSync - - - - - - - CsumRSRequest - - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_csum_rs_req() - - - got_IsInSync() - - - Checksum based Resync, case in sync - - - - - - - - - - OVReply - - - - - - - OVRequest - - - - receive_OVRequest() - - - drbd_endio_read_sec() - - - w_e_end_ov_req() - - - receive_OVReply() - - - drbd_endio_read_sec() - - - w_e_end_ov_reply() - - - - - - OVResult - - - - got_OVResult() - - - Online verify - - - w_make_ov_request() - - - - - - - - drbd_endio_read_sec() - - - w_make_resync_request() - - - w_e_send_csum() - - - - - drbd_endio_read_sec() - - - - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_complete_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg deleted file mode 100644 index 48a1e2165fec..000000000000 --- a/Documentation/blockdev/drbd/DRBD-data-packets.svg +++ /dev/null @@ -1,459 +0,0 @@ - - - - - - Master slide - - - - - - - - - RSDataReply - - - - - RSDataRequest - - - w_make_resync_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_rsdata_req() - - - receive_RSDataReply() - - - drbd_endio_write_sec() - - - e_end_resync_block() - - - - - WriteAck - - - got_BlockAck() - - - Resync blocks, 4-32K - - - - - - - WriteAck - - - - - Data - - - drbd_make_request() - - - receive_Data() - - - drbd_endio_write_sec() - - - e_end_block() - - - got_BlockAck() - - - Regular mirrored write, 512-32K - - - w_send_dblock() - - - - - drbd_endio_write_pri() - - - - - - - DataReply - - - - - DataRequest - - - drbd_make_request() - - - receive_DataRequest() - - - drbd_endio_read_sec() - - - w_e_end_data_req() - - - Drawing - - receive_DataReply() - - - - Diskless read, 512-32K - - - w_send_read_req() - - - DRBD 8 data flow - - - - - - al_begin_io() - - - al_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - - rs_begin_io() - - - rs_complete_io() - - diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot deleted file mode 100644 index 025e8cf5e64a..000000000000 --- a/Documentation/blockdev/drbd/conn-states-8.dot +++ /dev/null @@ -1,18 +0,0 @@ -digraph conn_states { - StandAllone -> WFConnection [ label = "ioctl_set_net()" ] - WFConnection -> Unconnected [ label = "unable to bind()" ] - WFConnection -> WFReportParams [ label = "in connect() after accept" ] - WFReportParams -> StandAllone [ label = "checks in receive_param()" ] - WFReportParams -> Connected [ label = "in receive_param()" ] - WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] - WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] - WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] - WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] - SyncSource -> Connected - SyncTarget -> Connected - SyncSource -> PausedSyncS - SyncTarget -> PausedSyncT - PausedSyncS -> SyncSource - PausedSyncT -> SyncTarget - Connected -> WFConnection [ label = "* on network error" ] -} diff --git a/Documentation/blockdev/drbd/data-structure-v9.rst b/Documentation/blockdev/drbd/data-structure-v9.rst deleted file mode 100644 index 66036b901644..000000000000 --- a/Documentation/blockdev/drbd/data-structure-v9.rst +++ /dev/null @@ -1,42 +0,0 @@ -================================ -kernel data structure for DRBD-9 -================================ - -This describes the in kernel data structure for DRBD-9. Starting with -Linux v3.14 we are reorganizing DRBD to use this data structure. - -Basic Data Structure -==================== - -A node has a number of DRBD resources. Each such resource has a number of -devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD -device is represented by a block device locally. - -The DRBD objects are interconnected to form a matrix as depicted below; a -drbd_peer_device object sits at each intersection between a drbd_device and a -drbd_connection:: - - /--------------+---------------+.....+---------------\ - | resource | device | | device | - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - +--------------+---------------+.....+---------------+ - : : : : : - : : : : : - +--------------+---------------+.....+---------------+ - | connection | peer_device | | peer_device | - \--------------+---------------+.....+---------------/ - -In this table, horizontally, devices can be accessed from resources by their -volume number. Likewise, peer_devices can be accessed from connections by -their volume number. Objects in the vertical direction are connected by double -linked lists. There are back pointers from peer_devices to their connections a -devices, and from connections and devices to their resource. - -All resources are in the drbd_resources double-linked list. In addition, all -devices can be accessed by their minor device number via the drbd_devices idr. - -The drbd_resource, drbd_connection, and drbd_device objects are reference -counted. The peer_device objects only serve to establish the links between -devices and connections; their lifetime is determined by the lifetime of the -device and connection which they reference. diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot deleted file mode 100644 index d06cfb46fb98..000000000000 --- a/Documentation/blockdev/drbd/disk-states-8.dot +++ /dev/null @@ -1,16 +0,0 @@ -digraph disk_states { - Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] - Diskless -> Consistent [ label = "ioctl_set_disk()" ] - Diskless -> Outdated [ label = "ioctl_set_disk()" ] - Consistent -> Outdated [ label = "receive_param()" ] - Consistent -> UpToDate [ label = "receive_param()" ] - Consistent -> Inconsistent [ label = "start resync" ] - Outdated -> Inconsistent [ label = "start resync" ] - UpToDate -> Inconsistent [ label = "ioctl_replicate" ] - Inconsistent -> UpToDate [ label = "resync completed" ] - Consistent -> Failed [ label = "io completion error" ] - Outdated -> Failed [ label = "io completion error" ] - UpToDate -> Failed [ label = "io completion error" ] - Inconsistent -> Failed [ label = "io completion error" ] - Failed -> Diskless [ label = "sending notify to peer" ] -} diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot deleted file mode 100644 index 6d9cf0a7b11d..000000000000 --- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot +++ /dev/null @@ -1,85 +0,0 @@ -// vim: set sw=2 sts=2 : -digraph { - rankdir=BT - bgcolor=white - - node [shape=plaintext] - node [fontcolor=black] - - StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] - - node [fontcolor=lightgray] - - Unconnected [ label=Unconnected ] - - CommTrouble [ shape=record, - label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] - - node [fontcolor=gray] - - subgraph cluster_try_connect { - label="try to connect, handshake" - rank=max - WFConnection [ label=WFConnection ] - WFReportParams [ label=WFReportParams ] - } - - TearDown [ label=TearDown ] - - Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] - - node [fontcolor=lightblue] - - StartingSyncS [ label=StartingSyncS ] - StartingSyncT [ label=StartingSyncT ] - - subgraph cluster_bitmap_exchange { - node [fontcolor=red] - fontcolor=red - label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" - - WFBitMapT [ label=WFBitMapT ] - WFSyncUUID [ label=WFSyncUUID ] - WFBitMapS [ label=WFBitMapS ] - } - - node [fontcolor=blue] - - cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] - - node [shape=box,fontcolor=black] - - // drbdadm [label="drbdadm connect"] - // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] - // comm_error [label="communication trouble"] - - // - // edges - // -------------------------------------- - - StandAlone -> Unconnected [ label="drbdadm connect" ] - Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] - Unconnected -> WFConnection [ label="receiver thread is started" ] - WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] - - WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] - WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] - - WFReportParams -> WFBitMapS - WFReportParams -> WFBitMapT - WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] - - WFBitMapS -> cluster_resync:S - WFSyncUUID -> cluster_resync:T - - edge [color=green] - cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] - - edge [color=red] - WFReportParams -> CommTrouble - Connected -> CommTrouble - cluster_resync:any -> CommTrouble - edge [color=black] - CommTrouble -> Unconnected [label="receiver thread is stopped" ] - -} diff --git a/Documentation/blockdev/drbd/figures.rst b/Documentation/blockdev/drbd/figures.rst deleted file mode 100644 index 3e3fd4b8a478..000000000000 --- a/Documentation/blockdev/drbd/figures.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. The here included files are intended to help understand the implementation - -Data flows that Relate some functions, and write packets -======================================================== - -.. kernel-figure:: DRBD-8.3-data-packets.svg - :alt: DRBD-8.3-data-packets.svg - :align: center - -.. kernel-figure:: DRBD-data-packets.svg - :alt: DRBD-data-packets.svg - :align: center - - -Sub graphs of DRBD's state transitions -====================================== - -.. kernel-figure:: conn-states-8.dot - :alt: conn-states-8.dot - :align: center - -.. kernel-figure:: disk-states-8.dot - :alt: disk-states-8.dot - :align: center - -.. kernel-figure:: node-states-8.dot - :alt: node-states-8.dot - :align: center diff --git a/Documentation/blockdev/drbd/index.rst b/Documentation/blockdev/drbd/index.rst deleted file mode 100644 index 68ecd5c113e9..000000000000 --- a/Documentation/blockdev/drbd/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -========================================== -Distributed Replicated Block Device - DRBD -========================================== - -Description -=========== - - DRBD is a shared-nothing, synchronously replicated block device. It - is designed to serve as a building block for high availability - clusters and in this context, is a "drop-in" replacement for shared - storage. Simplistically, you could see it as a network RAID 1. - - Please visit http://www.drbd.org to find out more. - -.. toctree:: - :maxdepth: 1 - - data-structure-v9 - figures diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot deleted file mode 100644 index 4a2b00c23547..000000000000 --- a/Documentation/blockdev/drbd/node-states-8.dot +++ /dev/null @@ -1,14 +0,0 @@ -digraph node_states { - Secondary -> Primary [ label = "ioctl_set_state()" ] - Primary -> Secondary [ label = "ioctl_set_state()" ] -} - -digraph peer_states { - Secondary -> Primary [ label = "recv state packet" ] - Primary -> Secondary [ label = "recv state packet" ] - Primary -> Unknown [ label = "connection lost" ] - Secondary -> Unknown [ label = "connection lost" ] - Unknown -> Primary [ label = "connected" ] - Unknown -> Secondary [ label = "connected" ] -} - diff --git a/Documentation/blockdev/floppy.rst b/Documentation/blockdev/floppy.rst deleted file mode 100644 index 4a8f31cf4139..000000000000 --- a/Documentation/blockdev/floppy.rst +++ /dev/null @@ -1,255 +0,0 @@ -============= -Floppy Driver -============= - -FAQ list: -========= - -A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - -The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - -Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad):: - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9:: - - append = "floppy=thinkpad" - -Several floppy related options may be given, example:: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - -If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - -If you use the floppy driver as a module, use the following syntax:: - - modprobe floppy floppy="" - -Example:: - - modprobe floppy floppy="omnibook messages" - -If you need certain options enabled every time you load the floppy driver, -you can put:: - - options floppy floppy="omnibook messages" - -in a configuration file in /etc/modprobe.d/. - - -The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc / floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook / floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - == ================================== - 0 Use the value of the physical CMOS - 1 5 1/4 DD - 2 5 1/4 HD - 3 3 1/2 DD - 4 3 1/2 HD - 5 3 1/2 ED - 6 3 1/2 ED - 16 unknown or not installed - == ================================== - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts / floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate:: - - PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases. - - -Supporting utilities and additional documentation: -================================================== - -Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - - http://fdutils.linux.lu - -The fdutils releases can be found at: - - http://fdutils.linux.lu/download.html - - http://www.tux.org/pub/knaff/fdutils/ - - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - -If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - -Be sure to read the FAQ before mailing/posting any bug reports! - -Alain - -Changelog -========= - -10-30-2004 : - Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : - Original Document diff --git a/Documentation/blockdev/index.rst b/Documentation/blockdev/index.rst deleted file mode 100644 index a9af6ed8b4aa..000000000000 --- a/Documentation/blockdev/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -:orphan: - -=========================== -The Linux RapidIO Subsystem -=========================== - -.. toctree:: - :maxdepth: 1 - - floppy - nbd - paride - ramdisk - zram - - drbd/index diff --git a/Documentation/blockdev/nbd.rst b/Documentation/blockdev/nbd.rst deleted file mode 100644 index d78dfe559dcf..000000000000 --- a/Documentation/blockdev/nbd.rst +++ /dev/null @@ -1,31 +0,0 @@ -================================== -Network Block Device (TCP version) -================================== - -1) Overview ------------ - -What is it: With this compiled in the kernel (or as a module), Linux -can use a remote server as one of its block devices. So every time -the client computer wants to read, e.g., /dev/nb0, it sends a -request over TCP to the server, which will reply with the data read. -This can be used for stations with low disk space (or even diskless) -to borrow disk space from another computer. -Unlike NFS, it is possible to put any filesystem on it, etc. - -For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. - -The nbd kernel module need only be installed on the client -system, as the nbd-server is completely in userspace. In fact, -the nbd-server has been successfully ported to other operating -systems, including Windows. - -A) NBD parameters ------------------ - -max_part - Number of partitions per device (default: 0). - -nbds_max - Number of block devices that should be initialized (default: 16). diff --git a/Documentation/blockdev/paride.rst b/Documentation/blockdev/paride.rst deleted file mode 100644 index 87b4278bf314..000000000000 --- a/Documentation/blockdev/paride.rst +++ /dev/null @@ -1,439 +0,0 @@ -=================================== -Linux and parallel port IDE devices -=================================== - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction -=============== - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - - MicroSolutions backpack CD-ROM - - MicroSolutions backpack PD/CD - - MicroSolutions backpack hard-drives - - MicroSolutions backpack 8000t tape drive - - SyQuest EZ-135, EZ-230 & SparQ drives - - Avatar Shark - - Imation Superdisk LS-120 - - Maxell Superdisk LS-120 - - FreeCom Power CD - - Hewlett-Packard 5GB and 8GB tape drives - - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - === ============= - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - === ============= - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - ==== ====================================== ==== - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - ==== ====================================== ==== - - -2. Using the PARIDE subsystem -============================= - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - ================ ============ ====== ======== - Manufacturer Model Driver Protocol - ================ ============ ====== ======== - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - ================ ============ ====== ======== - -2.1 Configuring built-in drivers ---------------------------------- - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like:: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command:: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules ----------------------------------------------- - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: - using these drivers with the "kerneld" automatic module loading - system is not recommended for beginners, and is not documented here. - -Note 2: - if you build PARPORT support as a loadable module, PARIDE must - also be built as loadable modules, and PARPORT must be loaded before - the PARIDE modules. - -To use PARIDE, you must begin by:: - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example:: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command:: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378:: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device --------------------------- - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute:: - - #!/bin/bash - # - # mkd -- a script to create the device special files for the PARIDE subsystem - # - function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 - } - # - function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done - } - # - cd /dev - # - for u in 0 1 2 3 ; do pd $u ; done - for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done - for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done - for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done - for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done - for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done - # - # end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use:: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like:: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system:: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver ------------------- - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver ------------------------- - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver ------------------------- - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting -================== - -3.1 Use EPP mode if you can ----------------------------- - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay -------------------------- - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset -------------------------------------- - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing:: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help ------------------------------------------------------- - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in:: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help ---------------------------------- - -You can join the linux-parport mailing list by sending a mail message -to: - - linux-parport-request@torque.net - -with the single word:: - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://web.archive.org/web/%2E/http://www.torque.net/parport/ diff --git a/Documentation/blockdev/ramdisk.rst b/Documentation/blockdev/ramdisk.rst deleted file mode 100644 index b7c2268f8dec..000000000000 --- a/Documentation/blockdev/ramdisk.rst +++ /dev/null @@ -1,177 +0,0 @@ -========================================== -Using the RAM disk block device with Linux -========================================== - -.. Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Parameters ---------------------------------- - -2a) Kernel Command Line Parameters - - ramdisk_size=N - Size of the ramdisk. - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB). - -2b) Module parameters - - rd_nr - /dev/ramX devices created. - - max_part - Maximum partition number. - - rd_size - See ramdisk_size. - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below:: - - ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF - ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 - ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do:: - - /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use:: - - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" - -Since the default start = 0 and the default prompt = 1, you could use:: - - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ------------------------------------------------ - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create:: - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example:: - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing:: - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy:: - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB):: - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552:: - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - - - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : - Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : - Original Document diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst deleted file mode 100644 index 6eccf13219ff..000000000000 --- a/Documentation/blockdev/zram.rst +++ /dev/null @@ -1,422 +0,0 @@ -======================================== -zram: Compressed RAM based block devices -======================================== - -Introduction -============ - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -Usage -===== - -There are several ways to configure and manage zram device(-s): - -a) using zram and zram_control sysfs attributes -b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org). - -In this document we will describe only 'manual' zram configuration steps, -IOW, zram and zram_control sysfs attributes. - -In order to get a better idea about zramctl please consult util-linux -documentation, zramctl man-page or `zramctl --help`. Please be informed -that zram maintainers do not develop/maintain util-linux or zramctl, should -you have any questions please contact util-linux@vger.kernel.org - -Following shows a typical sequence of steps for using zram. - -WARNING -======= - -For the sake of simplicity we skip error checking parts in most of the -examples below. However, it is your sole responsibility to handle errors. - -zram sysfs attributes always return negative values in case of errors. -The list of possible return codes: - -======== ============================================================= --EBUSY an attempt to modify an attribute that cannot be changed once - the device has been initialised. Please reset device first; --ENOMEM zram was not able to allocate enough memory to fulfil your - needs; --EINVAL invalid input has been provided. -======== ============================================================= - -If you use 'echo', the returned value that is changed by 'echo' utility, -and, in general case, something like:: - - echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; - handle_error - fi - -should suffice. - -1) Load Module -============== - -:: - - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - -num_devices parameter is optional and tells zram how many devices should be -pre-created. Default: 1. - -2) Set max number of compression streams -======================================== - -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus -allowing several concurrent compression operations. The number of -allocated compression streams goes down when some of the CPUs -become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. - -To find out how many streams are currently available:: - - cat /sys/block/zram0/max_comp_streams - -3) Select compression algorithm -=============================== - -Using comp_algorithm device attribute one can see available and -currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised -there is no way to change compression algorithm). - -Examples:: - - #show supported compression algorithms - cat /sys/block/zram0/comp_algorithm - lzo [lz4] - - #select lzo compression algorithm - echo lzo > /sys/block/zram0/comp_algorithm - -For the time being, the `comp_algorithm` content does not necessarily -show every compression algorithm supported by the kernel. We keep this -list primarily to simplify device configuration and one can configure -a new device with a compression algorithm that is not listed in -`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API -and, if some of the algorithms were built as modules, it's impossible -to list all of them using, for instance, /proc/crypto or any other -method. This, however, has an advantage of permitting the usage of -custom crypto compression modules (implementing S/W or H/W compression). - -4) Set Disksize -=============== - -Set disk size by writing the value to sysfs node 'disksize'. -The value can be either in bytes or you can use mem suffixes. -Examples:: - - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -Note: -There is little point creating a zram of greater than twice the size of memory -since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the -size of the disk when not in use so a huge zram is wasteful. - -5) Set memory limit: Optional -============================= - -Set memory limit by writing the value to sysfs node 'mem_limit'. -The value can be either in bytes or you can use mem suffixes. -In addition, you could change the value in runtime. -Examples:: - - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit - - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit - -6) Activate -=========== - -:: - - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -7) Add/remove zram devices -========================== - -zram provides a control interface, which enables dynamic (on-demand) device -addition and removal. - -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. - -Example:: - - cat /sys/class/zram-control/hot_add - 1 - -To remove the existing /dev/zramX device (where X is a device id) -execute:: - - echo X > /sys/class/zram-control/hot_remove - -8) Stats -======== - -Per-device statistics are exported as various nodes under /sys/block/zram/ - -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. - -====================== ====== =============================================== -Name access description -====================== ====== =============================================== -disksize RW show and set the device's disk size -initstate RO shows the initialization state of the device -reset WO trigger device reset -mem_used_max WO reset the `mem_used_max` counter (see later) -mem_limit WO specifies the maximum amount of memory ZRAM can - use to store the compressed data -writeback_limit WO specifies the maximum amount of write IO zram - can write out to backing device as 4KB unit -writeback_limit_enable RW show and set writeback_limit feature -max_comp_streams RW the number of possible concurrent compress - operations -comp_algorithm RW show and change the compression algorithm -compact WO trigger memory compaction -debug_stat RO this file is used for zram debugging purposes -backing_dev RW set up backend storage for zram to write out -idle WO mark allocated slot as idle -====================== ====== =============================================== - - -User space is advised to use the following files to read the device statistics. - -File /sys/block/zram/stat - -Represents block layer statistics. Read Documentation/block/stat.rst for -details. - -File /sys/block/zram/io_stat - -The stat file represents device's I/O statistics not accounted by block -layer and, thus, not available in zram/stat file. It consists of a -single line of text and contains the following stats separated by -whitespace: - - ============= ============================================================= - failed_reads The number of failed reads - failed_writes The number of failed writes - invalid_io The number of non-page-size-aligned I/O requests - notify_free Depending on device usage scenario it may account - - a) the number of pages freed because of swap slot free - notifications - b) the number of pages freed because of - REQ_OP_DISCARD requests sent by bio. The former ones are - sent to a swap block device when a swap slot is freed, - which implies that this disk is being used as a swap disk. - - The latter ones are sent by filesystem mounted with - discard option, whenever some data blocks are getting - discarded. - ============= ============================================================= - -File /sys/block/zram/mm_stat - -The stat file represents device's mm statistics. It consists of a single -line of text and contains the following stats separated by whitespace: - - ================ ============================================================= - orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. - Unit: bytes - compr_data_size compressed size of data stored in this disk - mem_used_total the amount of memory allocated for this disk. This - includes allocator fragmentation and metadata overhead, - allocated for this disk. So, allocator space efficiency - can be calculated using compr_data_size and this statistic. - Unit: bytes - mem_limit the maximum amount of memory ZRAM can use to store - the compressed data - mem_used_max the maximum amount of memory zram have consumed to - store the data - same_pages the number of same element filled pages written to this disk. - No memory is allocated for such pages. - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - ================ ============================================================= - -File /sys/block/zram/bd_stat - -The stat file represents device's backing device statistics. It consists of -a single line of text and contains the following stats separated by whitespace: - - ============== ============================================================= - bd_count size of data written in backing device. - Unit: 4K bytes - bd_reads the number of reads from backing device - Unit: 4K bytes - bd_writes the number of writes to backing device - Unit: 4K bytes - ============== ============================================================= - -9) Deactivate -============= - -:: - - swapoff /dev/zram0 - umount /dev/zram1 - -10) Reset -========= - - Write any positive value to 'reset' sysfs node:: - - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Optional Feature -================ - -writeback ---------- - -With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page -to backing storage rather than keeping it in memory. -To use the feature, admin should set up backing device via:: - - echo /dev/sda5 > /sys/block/zramX/backing_dev - -before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via:: - - echo huge > /sys/block/zramX/write - -To use idle page writeback, first, user need to declare zram pages -as idle:: - - echo all > /sys/block/zramX/idle - -From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. -IOW, unless there is access request, those pages are still idle pages. - -Admin can request writeback of those idle pages at right timing via:: - - echo idle > /sys/block/zramX/writeback - -With the command, zram writeback idle pages from memory to the storage. - -If there are lots of write IO with flash device, potentially, it has -flash wearout problem so that admin needs to design write limitation -to guarantee storage health for entire product life. - -To overcome the concern, zram supports "writeback_limit" feature. -The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should -enable writeback_limit_enable via:: - - $ echo 1 > /sys/block/zramX/writeback_limit_enable - -Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. - -(If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) - -If admin want to limit writeback as per-day 400M, he could do it -like below:: - - $ MB_SHIFT=20 - $ 4K_SHIFT=12 - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit. - $ echo 1 > /sys/block/zram0/writeback_limit_enable - -If admin want to allow further write again once the bugdet is exausted, -he could do it like below:: - - $ echo $((400<>4K_SHIFT)) > \ - /sys/block/zram0/writeback_limit - -If admin want to see remaining writeback budget since he set:: - - $ cat /sys/block/zramX/writeback_limit - -If admin want to disable writeback limit, he could do:: - - $ echo 0 > /sys/block/zramX/writeback_limit_enable - -The writeback_limit count will reset whenever you reset zram(e.g., -system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of -writeback happened until you reset the zram to allocate extra writeback -budget in next setting is user's job. - -If admin want to measure writeback count in a certain period, he could -know it via /sys/block/zram0/bd_stat's 3rd column. - -memory tracking -=============== - -With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the -zram block. It could be useful to catch cold or incompressible -pages of the process with*pagemap. - -If you enable the feature, you could see block state via -/sys/kernel/debug/zram/zram0/block_state". The output is as follows:: - - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi - -First column - zram's block index. -Second column - access time since the system was booted -Third column - state of the block: - - s: - same page - w: - written page to backing store - h: - huge page - i: - idle page - -First line of above example says 300th block is accessed at 75.033841sec -and the block's state is huge so it is written back to the backing -storage. It's a debugging feature so anyone shouldn't rely on it to work -properly. - -Nitin Gupta -ngupta@vflare.org diff --git a/MAINTAINERS b/MAINTAINERS index b36028f43192..699596d931c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5006,7 +5006,7 @@ T: git git://git.linbit.com/drbd-8.4.git S: Supported F: drivers/block/drbd/ F: lib/lru_cache.c -F: Documentation/blockdev/drbd/ +F: Documentation/admin-guide/blockdev/ DRIVER CORE, KOBJECTS, DEBUGFS AND SYSFS M: Greg Kroah-Hartman @@ -11076,7 +11076,7 @@ M: Josef Bacik S: Maintained L: linux-block@vger.kernel.org L: nbd@other.debian.org -F: Documentation/blockdev/nbd.rst +F: Documentation/admin-guide/blockdev/nbd.rst F: drivers/block/nbd.c F: include/trace/events/nbd.h F: include/uapi/linux/nbd.h @@ -12086,7 +12086,7 @@ PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh L: linux-parport@lists.infradead.org (subscribers-only) S: Maintained -F: Documentation/blockdev/paride.rst +F: Documentation/admin-guide/blockdev/paride.rst F: drivers/block/paride/ PARISC ARCHITECTURE @@ -13367,7 +13367,7 @@ F: drivers/net/wireless/ralink/rt2x00/ RAMDISK RAM BLOCK DEVICE DRIVER M: Jens Axboe S: Maintained -F: Documentation/blockdev/ramdisk.rst +F: Documentation/admin-guide/blockdev/ramdisk.rst F: drivers/block/brd.c RANCHU VIRTUAL BOARD FOR MIPS @@ -17723,7 +17723,7 @@ R: Sergey Senozhatsky L: linux-kernel@vger.kernel.org S: Maintained F: drivers/block/zram/ -F: Documentation/blockdev/zram.rst +F: Documentation/admin-guide/blockdev/zram.rst ZS DECSTATION Z85C30 SERIAL DRIVER M: "Maciej W. Rozycki" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index c43690b973d8..1bb8ec575352 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -31,7 +31,7 @@ config BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, say Y. Information about this driver, especially important for IBM Thinkpad users, is contained in - . + . That file also contains the location of the Floppy driver FAQ as well as location of the fdutils package used to configure additional parameters of the driver at run time. @@ -96,7 +96,7 @@ config PARIDE your computer's parallel port. Most of them are actually IDE devices using a parallel port IDE adapter. This option enables the PARIDE subsystem which contains drivers for many of these external drives. - Read for more information. + Read for more information. If you have said Y to the "Parallel-port support" configuration option, you may share a single port between your printer and other @@ -261,7 +261,7 @@ config BLK_DEV_NBD userland (making server and client physically the same computer, communicating using the loopback network device). - Read for more information, + Read for more information, especially about where to find the server code, which runs in user space and does not need special kernel support. @@ -303,7 +303,7 @@ config BLK_DEV_RAM during the initial install of Linux. Note that the kernel command line option "ramdisk=XX" is now obsolete. - For details, read . + For details, read . To compile this driver as a module, choose M here: the module will be called brd. An alias "rd" has been defined diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 5c99e52f9dc1..f652c1ac3ae9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4424,7 +4424,7 @@ static int __init floppy_setup(char *str) pr_cont("\n"); } else DPRINT("botched floppy option\n"); - DPRINT("Read Documentation/blockdev/floppy.rst\n"); + DPRINT("Read Documentation/admin-guide/blockdev/floppy.rst\n"); return 0; } diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index e06b99d54816..fe7a4b7d30cf 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -12,7 +12,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. config ZRAM_WRITEBACK bool "Write back incompressible or idle page to backing device" @@ -26,7 +26,7 @@ config ZRAM_WRITEBACK With /sys/block/zramX/{idle,writeback}, application could ask idle page's writeback to the backing device to save in memory. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. config ZRAM_MEMORY_TRACKING bool "Track zRam block status" @@ -36,4 +36,4 @@ config ZRAM_MEMORY_TRACKING of zRAM. Admin could see the information via /sys/kernel/debug/zram/zramX/block_state. - See Documentation/blockdev/zram.rst for more information. + See Documentation/admin-guide/blockdev/zram.rst for more information. diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 5fa378391d3b..110b34834a6f 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -37,4 +37,4 @@ Commands required for testing: - mkfs/ mkfs.ext4 For more information please refer: -kernel-source-tree/Documentation/blockdev/zram.rst +kernel-source-tree/Documentation/admin-guide/blockdev/zram.rst -- cgit v1.2.3-55-g7522 From 03638e62f55f27e7a96d6b1175e75b7a81e562b3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 10 Apr 2019 09:56:05 -0700 Subject: LSM: SafeSetID: rewrite userspace API to atomic updates The current API of the SafeSetID LSM uses one write() per rule, and applies each written rule instantly. This has several downsides: - While a policy is being loaded, once a single parent-child pair has been loaded, the parent is restricted to that specific child, even if subsequent rules would allow transitions to other child UIDs. This means that during policy loading, set*uid() can randomly fail. - To replace the policy without rebooting, it is necessary to first flush all old rules. This creates a time window in which no constraints are placed on the use of CAP_SETUID. - If we want to perform sanity checks on the final policy, this requires that the policy isn't constructed in a piecemeal fashion without telling the kernel when it's done. Other kernel APIs - including things like the userns code and netfilter - avoid this problem by performing updates atomically. Luckily, SafeSetID hasn't landed in a stable (upstream) release yet, so maybe it's not too late to completely change the API. The new API for SafeSetID is: If you want to change the policy, open "safesetid/whitelist_policy" and write the entire policy, newline-delimited, in there. Signed-off-by: Jann Horn Signed-off-by: Micah Morton --- security/safesetid/lsm.c | 84 +++------ security/safesetid/lsm.h | 24 +-- security/safesetid/securityfs.c | 194 ++++++++++++--------- tools/testing/selftests/safesetid/safesetid-test.c | 16 +- 4 files changed, 149 insertions(+), 169 deletions(-) (limited to 'tools/testing') diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index 9db1c7a51d3d..22964e2a6187 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -24,28 +24,38 @@ /* Flag indicating whether initialization completed */ int safesetid_initialized; -#define NUM_BITS 8 /* 256 buckets in hash table */ +struct setuid_ruleset __rcu *safesetid_setuid_rules; -static DEFINE_HASHTABLE(safesetid_whitelist_hashtable, NUM_BITS); - -static DEFINE_SPINLOCK(safesetid_whitelist_hashtable_spinlock); - -static enum sid_policy_type setuid_policy_lookup(kuid_t src, kuid_t dst) +/* Compute a decision for a transition from @src to @dst under @policy. */ +enum sid_policy_type _setuid_policy_lookup(struct setuid_ruleset *policy, + kuid_t src, kuid_t dst) { - struct entry *entry; + struct setuid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; - rcu_read_lock(); - hash_for_each_possible_rcu(safesetid_whitelist_hashtable, - entry, next, __kuid_val(src)) { - if (!uid_eq(entry->src_uid, src)) + hash_for_each_possible(policy->rules, rule, next, __kuid_val(src)) { + if (!uid_eq(rule->src_uid, src)) continue; - if (uid_eq(entry->dst_uid, dst)) { - rcu_read_unlock(); + if (uid_eq(rule->dst_uid, dst)) return SIDPOL_ALLOWED; - } result = SIDPOL_CONSTRAINED; } + return result; +} + +/* + * Compute a decision for a transition from @src to @dst under the active + * policy. + */ +static enum sid_policy_type setuid_policy_lookup(kuid_t src, kuid_t dst) +{ + enum sid_policy_type result = SIDPOL_DEFAULT; + struct setuid_ruleset *pol; + + rcu_read_lock(); + pol = rcu_dereference(safesetid_setuid_rules); + if (pol) + result = _setuid_policy_lookup(pol, src, dst); rcu_read_unlock(); return result; } @@ -139,52 +149,6 @@ static int safesetid_task_fix_setuid(struct cred *new, return -EACCES; } -int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child) -{ - struct entry *new; - - /* Return if entry already exists */ - if (setuid_policy_lookup(parent, child) == SIDPOL_ALLOWED) - return 0; - - new = kzalloc(sizeof(struct entry), GFP_KERNEL); - if (!new) - return -ENOMEM; - new->src_uid = parent; - new->dst_uid = child; - spin_lock(&safesetid_whitelist_hashtable_spinlock); - hash_add_rcu(safesetid_whitelist_hashtable, - &new->next, - __kuid_val(parent)); - spin_unlock(&safesetid_whitelist_hashtable_spinlock); - return 0; -} - -void flush_safesetid_whitelist_entries(void) -{ - struct entry *entry; - struct hlist_node *hlist_node; - unsigned int bkt_loop_cursor; - HLIST_HEAD(free_list); - - /* - * Could probably use hash_for_each_rcu here instead, but this should - * be fine as well. - */ - spin_lock(&safesetid_whitelist_hashtable_spinlock); - hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor, - hlist_node, entry, next) { - hash_del_rcu(&entry->next); - hlist_add_head(&entry->dlist, &free_list); - } - spin_unlock(&safesetid_whitelist_hashtable_spinlock); - synchronize_rcu(); - hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) { - hlist_del(&entry->dlist); - kfree(entry); - } -} - static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(capable, safesetid_security_capable) diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h index 6806f902794c..4a34f558d964 100644 --- a/security/safesetid/lsm.h +++ b/security/safesetid/lsm.h @@ -21,12 +21,6 @@ /* Flag indicating whether initialization completed */ extern int safesetid_initialized; -/* Function type. */ -enum safesetid_whitelist_file_write_type { - SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */ - SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */ -}; - enum sid_policy_type { SIDPOL_DEFAULT, /* source ID is unaffected by policy */ SIDPOL_CONSTRAINED, /* source ID is affected by policy */ @@ -35,18 +29,24 @@ enum sid_policy_type { /* * Hash table entry to store safesetid policy signifying that 'src_uid' - * can setid to 'dst_uid'. + * can setuid to 'dst_uid'. */ -struct entry { +struct setuid_rule { struct hlist_node next; - struct hlist_node dlist; /* for deletion cleanup */ kuid_t src_uid; kuid_t dst_uid; }; -/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */ -int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child); +#define SETID_HASH_BITS 8 /* 256 buckets in hash table */ + +struct setuid_ruleset { + DECLARE_HASHTABLE(rules, SETID_HASH_BITS); + struct rcu_head rcu; +}; + +enum sid_policy_type _setuid_policy_lookup(struct setuid_ruleset *policy, + kuid_t src, kuid_t dst); -void flush_safesetid_whitelist_entries(void); +extern struct setuid_ruleset __rcu *safesetid_setuid_rules; #endif /* _SAFESETID_H */ diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index 77d301f0ff7a..250d59e046c1 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -11,25 +11,15 @@ * published by the Free Software Foundation. * */ + +#define pr_fmt(fmt) "SafeSetID: " fmt + #include #include #include "lsm.h" -static struct dentry *safesetid_policy_dir; - -struct safesetid_file_entry { - const char *name; - enum safesetid_whitelist_file_write_type type; - struct dentry *dentry; -}; - -static struct safesetid_file_entry safesetid_files[] = { - {.name = "add_whitelist_policy", - .type = SAFESETID_WHITELIST_ADD}, - {.name = "flush_whitelist_policies", - .type = SAFESETID_WHITELIST_FLUSH}, -}; +static DEFINE_SPINLOCK(policy_update_lock); /* * In the case the input buffer contains one or more invalid UIDs, the kuid_t @@ -37,8 +27,8 @@ static struct safesetid_file_entry safesetid_files[] = { * function will return an error. * Contents of @buf may be modified. */ -static int parse_policy_line( - struct file *file, char *buf, kuid_t *parent, kuid_t *child) +static int parse_policy_line(struct file *file, char *buf, + struct setuid_rule *rule) { char *child_str; int ret; @@ -59,26 +49,103 @@ static int parse_policy_line( if (ret) return ret; - *parent = make_kuid(file->f_cred->user_ns, parsed_parent); - *child = make_kuid(file->f_cred->user_ns, parsed_child); - if (!uid_valid(*parent) || !uid_valid(*child)) + rule->src_uid = make_kuid(file->f_cred->user_ns, parsed_parent); + rule->dst_uid = make_kuid(file->f_cred->user_ns, parsed_child); + if (!uid_valid(rule->src_uid) || !uid_valid(rule->dst_uid)) return -EINVAL; return 0; } -static int parse_safesetid_whitelist_policy( - struct file *file, const char __user *buf, size_t len, - kuid_t *parent, kuid_t *child) +static void __release_ruleset(struct rcu_head *rcu) { - char *kern_buf = memdup_user_nul(buf, len); - int ret; + struct setuid_ruleset *pol = + container_of(rcu, struct setuid_ruleset, rcu); + int bucket; + struct setuid_rule *rule; + struct hlist_node *tmp; + + hash_for_each_safe(pol->rules, bucket, tmp, rule, next) + kfree(rule); + kfree(pol); +} - if (IS_ERR(kern_buf)) - return PTR_ERR(kern_buf); - ret = parse_policy_line(file, kern_buf, parent, child); - kfree(kern_buf); - return ret; +static void release_ruleset(struct setuid_ruleset *pol) +{ + call_rcu(&pol->rcu, __release_ruleset); +} + +static ssize_t handle_policy_update(struct file *file, + const char __user *ubuf, size_t len) +{ + struct setuid_ruleset *pol; + char *buf, *p, *end; + int err; + + pol = kmalloc(sizeof(struct setuid_ruleset), GFP_KERNEL); + if (!pol) + return -ENOMEM; + hash_init(pol->rules); + + p = buf = memdup_user_nul(ubuf, len); + if (IS_ERR(buf)) { + err = PTR_ERR(buf); + goto out_free_pol; + } + + /* policy lines, including the last one, end with \n */ + while (*p != '\0') { + struct setuid_rule *rule; + + end = strchr(p, '\n'); + if (end == NULL) { + err = -EINVAL; + goto out_free_buf; + } + *end = '\0'; + + rule = kmalloc(sizeof(struct setuid_rule), GFP_KERNEL); + if (!rule) { + err = -ENOMEM; + goto out_free_buf; + } + + err = parse_policy_line(file, p, rule); + if (err) + goto out_free_rule; + + if (_setuid_policy_lookup(pol, rule->src_uid, rule->dst_uid) == + SIDPOL_ALLOWED) { + pr_warn("bad policy: duplicate entry\n"); + err = -EEXIST; + goto out_free_rule; + } + + hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); + p = end + 1; + continue; + +out_free_rule: + kfree(rule); + goto out_free_buf; + } + + /* + * Everything looks good, apply the policy and release the old one. + * What we really want here is an xchg() wrapper for RCU, but since that + * doesn't currently exist, just use a spinlock for now. + */ + spin_lock(&policy_update_lock); + rcu_swap_protected(safesetid_setuid_rules, pol, + lockdep_is_held(&policy_update_lock)); + spin_unlock(&policy_update_lock); + err = len; + +out_free_buf: + kfree(buf); +out_free_pol: + release_ruleset(pol); + return err; } static ssize_t safesetid_file_write(struct file *file, @@ -86,90 +153,45 @@ static ssize_t safesetid_file_write(struct file *file, size_t len, loff_t *ppos) { - struct safesetid_file_entry *file_entry = - file->f_inode->i_private; - kuid_t parent; - kuid_t child; - int ret; - if (!file_ns_capable(file, &init_user_ns, CAP_MAC_ADMIN)) return -EPERM; if (*ppos != 0) return -EINVAL; - switch (file_entry->type) { - case SAFESETID_WHITELIST_FLUSH: - flush_safesetid_whitelist_entries(); - break; - case SAFESETID_WHITELIST_ADD: - ret = parse_safesetid_whitelist_policy(file, buf, len, - &parent, &child); - if (ret) - return ret; - - ret = add_safesetid_whitelist_entry(parent, child); - if (ret) - return ret; - break; - default: - pr_warn("Unknown securityfs file %d\n", file_entry->type); - break; - } - - /* Return len on success so caller won't keep trying to write */ - return len; + return handle_policy_update(file, buf, len); } static const struct file_operations safesetid_file_fops = { .write = safesetid_file_write, }; -static void safesetid_shutdown_securityfs(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) { - struct safesetid_file_entry *entry = - &safesetid_files[i]; - securityfs_remove(entry->dentry); - entry->dentry = NULL; - } - - securityfs_remove(safesetid_policy_dir); - safesetid_policy_dir = NULL; -} - static int __init safesetid_init_securityfs(void) { - int i; int ret; + struct dentry *policy_dir; + struct dentry *policy_file; if (!safesetid_initialized) return 0; - safesetid_policy_dir = securityfs_create_dir("safesetid", NULL); - if (IS_ERR(safesetid_policy_dir)) { - ret = PTR_ERR(safesetid_policy_dir); + policy_dir = securityfs_create_dir("safesetid", NULL); + if (IS_ERR(policy_dir)) { + ret = PTR_ERR(policy_dir); goto error; } - for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) { - struct safesetid_file_entry *entry = - &safesetid_files[i]; - entry->dentry = securityfs_create_file( - entry->name, 0200, safesetid_policy_dir, - entry, &safesetid_file_fops); - if (IS_ERR(entry->dentry)) { - ret = PTR_ERR(entry->dentry); - goto error; - } + policy_file = securityfs_create_file("whitelist_policy", 0200, + policy_dir, NULL, &safesetid_file_fops); + if (IS_ERR(policy_file)) { + ret = PTR_ERR(policy_file); + goto error; } return 0; error: - safesetid_shutdown_securityfs(); + securityfs_remove(policy_dir); return ret; } fs_initcall(safesetid_init_securityfs); diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 892c8e8b1b8b..4f03813d1911 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -142,23 +142,17 @@ static void ensure_securityfs_mounted(void) static void write_policies(void) { + static char *policy_str = + "1:2\n" + "1:3\n"; ssize_t written; int fd; fd = open(add_whitelist_policy_file, O_WRONLY); if (fd < 0) die("cant open add_whitelist_policy file\n"); - written = write(fd, "1:2", strlen("1:2")); - if (written != strlen("1:2")) { - if (written >= 0) { - die("short write to %s\n", add_whitelist_policy_file); - } else { - die("write to %s failed: %s\n", - add_whitelist_policy_file, strerror(errno)); - } - } - written = write(fd, "1:3", strlen("1:3")); - if (written != strlen("1:3")) { + written = write(fd, policy_str, strlen(policy_str)); + if (written != strlen(policy_str)) { if (written >= 0) { die("short write to %s\n", add_whitelist_policy_file); } else { -- cgit v1.2.3-55-g7522 From 4f72123da579655855301b591535a1415224f123 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 11 Apr 2019 13:12:43 -0700 Subject: LSM: SafeSetID: verify transitive constrainedness Someone might write a ruleset like the following, expecting that it securely constrains UID 1 to UIDs 1, 2 and 3: 1:2 1:3 However, because no constraints are applied to UIDs 2 and 3, an attacker with UID 1 can simply first switch to UID 2, then switch to any UID from there. The secure way to write this ruleset would be: 1:2 1:3 2:2 3:3 , which uses "transition to self" as a way to inhibit the default-allow policy without allowing anything specific. This is somewhat unintuitive. To make sure that policy authors don't accidentally write insecure policies because of this, let the kernel verify that a new ruleset does not contain any entries that are constrained, but transitively unconstrained. Signed-off-by: Jann Horn Signed-off-by: Micah Morton --- security/safesetid/securityfs.c | 38 +++++++++++++++++++++- tools/testing/selftests/safesetid/safesetid-test.c | 4 ++- 2 files changed, 40 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index 997b403c6255..d568e17dd773 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -76,6 +76,37 @@ static void release_ruleset(struct setuid_ruleset *pol) call_rcu(&pol->rcu, __release_ruleset); } +static void insert_rule(struct setuid_ruleset *pol, struct setuid_rule *rule) +{ + hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); +} + +static int verify_ruleset(struct setuid_ruleset *pol) +{ + int bucket; + struct setuid_rule *rule, *nrule; + int res = 0; + + hash_for_each(pol->rules, bucket, rule, next) { + if (_setuid_policy_lookup(pol, rule->dst_uid, INVALID_UID) == + SIDPOL_DEFAULT) { + pr_warn("insecure policy detected: uid %d is constrained but transitively unconstrained through uid %d\n", + __kuid_val(rule->src_uid), + __kuid_val(rule->dst_uid)); + res = -EINVAL; + + /* fix it up */ + nrule = kmalloc(sizeof(struct setuid_rule), GFP_KERNEL); + if (!nrule) + return -ENOMEM; + nrule->src_uid = rule->dst_uid; + nrule->dst_uid = rule->dst_uid; + insert_rule(pol, nrule); + } + } + return res; +} + static ssize_t handle_policy_update(struct file *file, const char __user *ubuf, size_t len) { @@ -128,7 +159,7 @@ static ssize_t handle_policy_update(struct file *file, goto out_free_rule; } - hash_add(pol->rules, &rule->next, __kuid_val(rule->src_uid)); + insert_rule(pol, rule); p = end + 1; continue; @@ -137,6 +168,11 @@ out_free_rule: goto out_free_buf; } + err = verify_ruleset(pol); + /* bogus policy falls through after fixing it up */ + if (err && err != -EINVAL) + goto out_free_buf; + /* * Everything looks good, apply the policy and release the old one. * What we really want here is an xchg() wrapper for RCU, but since that diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 4f03813d1911..8f40c6ecdad1 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -144,7 +144,9 @@ static void write_policies(void) { static char *policy_str = "1:2\n" - "1:3\n"; + "1:3\n" + "2:2\n" + "3:3\n"; ssize_t written; int fd; -- cgit v1.2.3-55-g7522 From c7ca0b614513afba57824cae68447f9c32b1ee61 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 15 Jul 2019 07:21:44 -0700 Subject: Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test This reverts commit 48f5e52e916b55fb73754833efbacc7f8081a159. The ptrace ABI change was a prerequisite to the proposed design for FSGSBASE. Since FSGSBASE support has been reverted, and since I'm not convinced that the ABI was ever adequately tested, revert the ABI change as well. This also modifies the test case so that it tests the preexisting behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fca39c478ea7fb15bc76fe8a36bd180810a067f6.1563200250.git.luto@kernel.org --- arch/x86/kernel/ptrace.c | 14 ++++++++++++-- tools/testing/selftests/x86/fsgsbase.c | 22 ++++------------------ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 71691a8310e7..0fdbe89d0754 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -369,12 +369,22 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - x86_fsbase_write_task(child, value); + /* + * When changing the FS base, use do_arch_prctl_64() + * to set the index to zero and to set the base + * as requested. + */ + if (child->thread.fsbase != value) + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ if (value >= TASK_SIZE_MAX) return -EIO; - x86_gsbase_write_task(child, value); + if (child->thread.gsbase != value) + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 5ab4c60c100e..15a329da59fa 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,25 +489,11 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != *shared_scratch) { - nerrs++; - printf("[FAIL]\tGS changed to %lx\n", gs); - - /* - * On older kernels, poking a nonzero value into the - * base would zero the selector. On newer kernels, - * this behavior has changed -- poking the base - * changes only the base and, if FSGSBASE is not - * available, this may have no effect. - */ - if (gs == 0) - printf("\tNote: this is expected behavior on older kernels.\n"); - } else if (have_fsgsbase && (base != 0xFF)) { - nerrs++; - printf("[FAIL]\tGSBASE changed to %lx\n", base); + if (gs == 0 && base == 0xFF) { + printf("[OK]\tGS was reset as expected\n"); } else { - printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); - printf("\n"); + nerrs++; + printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); } } -- cgit v1.2.3-55-g7522 From dd13f3ca642986b59a1863101e58cd694e81f888 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:56 -0700 Subject: selftests/bpf: add trickier size resolution tests Add more BTF tests, validating that size resolution logic is correct in few trickier cases. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_btf.c | 88 ++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 8351cb5f4a20..3d617e806054 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -3417,6 +3417,94 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 1, .max_entries = 4, }, +/* + * typedef int arr_t[16]; + * struct s { + * arr_t *a; + * }; + */ +{ + .descr = "struct->ptr->typedef->array->int size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 5, 16), /* [4] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [5] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "ptr_mod_chain_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16, + .key_type_id = 5 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, +/* + * typedef int arr_t[16][8][4]; + * struct s { + * arr_t *a; + * }; + */ +{ + .descr = "struct->ptr->typedef->multi-array->int size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 7, 16), /* [4] */ + BTF_TYPE_ARRAY_ENC(6, 7, 8), /* [5] */ + BTF_TYPE_ARRAY_ENC(7, 7, 4), /* [6] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [7] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "multi_arr_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16 * 8 * 4, + .key_type_id = 7 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, +/* + * typedef int int_t; + * typedef int_t arr3_t[4]; + * typedef arr3_t arr2_t[8]; + * typedef arr2_t arr1_t[16]; + * struct s { + * arr1_t *a; + * }; + */ +{ + .descr = "typedef/multi-arr mix size resolution", + .raw_types = { + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_PTR_ENC(3), /* [2] */ + BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_ARRAY_ENC(5, 10, 16), /* [4] */ + BTF_TYPEDEF_ENC(NAME_TBD, 6), /* [5] */ + BTF_TYPE_ARRAY_ENC(7, 10, 8), /* [6] */ + BTF_TYPEDEF_ENC(NAME_TBD, 8), /* [7] */ + BTF_TYPE_ARRAY_ENC(9, 10, 4), /* [8] */ + BTF_TYPEDEF_ENC(NAME_TBD, 10), /* [9] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [10] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0a\0arr1_t\0arr2_t\0arr3_t\0int_t"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "typedef_arra_mix_size_resolve_map", + .key_size = sizeof(int), + .value_size = sizeof(int) * 16 * 8 * 4, + .key_type_id = 10 /* int */, + .value_type_id = 3 /* arr_t */, + .max_entries = 4, +}, }; /* struct btf_raw_test raw_tests[] */ -- cgit v1.2.3-55-g7522 From 8981e56fa17282598571958ae6a29cbc3209a6cb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:57 -0700 Subject: selftests/bpf: use typedef'ed arrays as map values Convert few tests that couldn't use typedef'ed arrays due to kernel bug. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c | 3 ++- tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c | 3 +-- tools/testing/selftests/bpf/progs/test_stacktrace_map.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c index d06b47a09097..33254b771384 100644 --- a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c +++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c @@ -47,11 +47,12 @@ struct { * issue and avoid complicated C programming massaging. * This is an acceptable workaround since there is one entry here. */ +typedef __u64 raw_stack_trace_t[2 * MAX_STACK_RAWTP]; struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, 1); __type(key, __u32); - __u64 (*value)[2 * MAX_STACK_RAWTP]; + __type(value, raw_stack_trace_t); } rawdata_map SEC(".maps"); SEC("tracepoint/raw_syscalls/sys_enter") diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c index bbfc8337b6f0..f5638e26865d 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c @@ -36,8 +36,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 128); __type(key, __u32); - /* there seems to be a bug in kernel not handling typedef properly */ - struct bpf_stack_build_id (*value)[PERF_MAX_STACK_DEPTH]; + __type(value, stack_trace_t); } stack_amap SEC(".maps"); /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */ diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c index 803c15dc109d..fa0be3e10a10 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c @@ -35,7 +35,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 16384); __type(key, __u32); - __u64 (*value)[PERF_MAX_STACK_DEPTH]; + __type(value, stack_trace_t); } stack_amap SEC(".maps"); /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ -- cgit v1.2.3-55-g7522 From 025c0c0917b78f21e3aaffecda75968d54172095 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:41:42 +0200 Subject: selftests/bpf: fix attach_probe on s390 attach_probe test fails, because it cannot install a kprobe on a non-existent sys_nanosleep symbol. Use the correct symbol name for the nanosleep syscall on 64-bit s390. Don't bother adding one for 31-bit mode, since tests are compiled only in 64-bit mode. Fixes: 1e8611bbdfc9 ("selftests/bpf: add kprobe/uprobe selftests") Signed-off-by: Ilya Leoshkevich Acked-by: Vasily Gorbik Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a4686395522c..47af4afc5013 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -23,6 +23,8 @@ ssize_t get_base_addr() { #ifdef __x86_64__ #define SYS_KPROBE_NAME "__x64_sys_nanosleep" +#elif defined(__s390x__) +#define SYS_KPROBE_NAME "__s390x_sys_nanosleep" #else #define SYS_KPROBE_NAME "sys_nanosleep" #endif -- cgit v1.2.3-55-g7522 From e46fc22e60a410d896448835adca95aa3332d25d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:56:31 +0200 Subject: selftests/bpf: make directory prerequisites order-only When directories are used as prerequisites in Makefiles, they can cause a lot of unnecessary rebuilds, because a directory is considered changed whenever a file in this directory is added, removed or modified. If the only thing a target is interested in is the existence of the directory it depends on, which is the case for selftests/bpf, this directory should be specified as an order-only prerequisite: it would still be created in case it does not exist, but it would not trigger a rebuild of a target in case it's considered changed. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 277d8605e340..0e003fb6641b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -183,12 +183,12 @@ TEST_CUSTOM_PROGS += $(ALU32_BUILD_DIR)/test_progs_32 $(ALU32_BUILD_DIR): mkdir -p $@ -$(ALU32_BUILD_DIR)/urandom_read: $(OUTPUT)/urandom_read +$(ALU32_BUILD_DIR)/urandom_read: $(OUTPUT)/urandom_read | $(ALU32_BUILD_DIR) cp $< $@ $(ALU32_BUILD_DIR)/test_progs_32: test_progs.c $(OUTPUT)/libbpf.a\ - $(ALU32_BUILD_DIR) \ - $(ALU32_BUILD_DIR)/urandom_read + $(ALU32_BUILD_DIR)/urandom_read \ + | $(ALU32_BUILD_DIR) $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) \ -o $(ALU32_BUILD_DIR)/test_progs_32 \ test_progs.c test_stub.c trace_helpers.c prog_tests/*.c \ @@ -197,8 +197,8 @@ $(ALU32_BUILD_DIR)/test_progs_32: test_progs.c $(OUTPUT)/libbpf.a\ $(ALU32_BUILD_DIR)/test_progs_32: $(PROG_TESTS_H) $(ALU32_BUILD_DIR)/test_progs_32: prog_tests/*.c -$(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR) \ - $(ALU32_BUILD_DIR)/test_progs_32 +$(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR)/test_progs_32 \ + | $(ALU32_BUILD_DIR) ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm -c $< -o - || \ echo "clang failed") | \ $(LLC) -march=bpf -mattr=+alu32 -mcpu=$(CPU) $(LLC_FLAGS) \ @@ -236,7 +236,7 @@ $(PROG_TESTS_DIR): mkdir -p $@ PROG_TESTS_FILES := $(wildcard prog_tests/*.c) -$(PROG_TESTS_H): $(PROG_TESTS_DIR) $(PROG_TESTS_FILES) +$(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) $(shell ( cd prog_tests/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef DECLARE'; \ @@ -257,7 +257,7 @@ MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h test_maps.c: $(MAP_TESTS_H) $(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS) MAP_TESTS_FILES := $(wildcard map_tests/*.c) -$(MAP_TESTS_H): $(MAP_TESTS_DIR) $(MAP_TESTS_FILES) +$(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) $(shell ( cd map_tests/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef DECLARE'; \ @@ -279,7 +279,7 @@ $(VERIFIER_TESTS_DIR): mkdir -p $@ VERIFIER_TEST_FILES := $(wildcard verifier/*.c) -$(OUTPUT)/verifier/tests.h: $(VERIFIER_TESTS_DIR) $(VERIFIER_TEST_FILES) +$(OUTPUT)/verifier/tests.h: $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) $(shell ( cd verifier/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef FILL_ARRAY'; \ -- cgit v1.2.3-55-g7522 From f83a46d4711e789642033eb00fed8e017e34fe7d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Jul 2019 15:59:50 +0200 Subject: selftests/bpf: put test_stub.o into $(OUTPUT) Add a rule to put test_stub.o in $(OUTPUT) and change the references to it accordingly. This prevents test_stub.o from being created in the source directory. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0e003fb6641b..1296253b3422 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -83,13 +83,16 @@ all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c $(CC) -o $@ $< -Wl,--build-id +$(OUTPUT)/test_stub.o: test_stub.c + $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) -c -o $@ $< + $(OUTPUT)/test_maps: map_tests/*.c BPFOBJ := $(OUTPUT)/libbpf.a -$(TEST_GEN_PROGS): test_stub.o $(BPFOBJ) +$(TEST_GEN_PROGS): $(OUTPUT)/test_stub.o $(BPFOBJ) -$(TEST_GEN_PROGS_EXTENDED): test_stub.o $(OUTPUT)/libbpf.a +$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c -- cgit v1.2.3-55-g7522 From 8b45063c8584b3e6caff0b109dd0f47b35487aba Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:54 -0700 Subject: selftests/bpf: rename verifier/wide_store.c to verifier/wide_access.c Move the file and rename internal BPF_SOCK_ADDR define to BPF_SOCK_ADDR_STORE. This selftest will be extended in the next commit with the wide loads. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/verifier/wide_access.c | 36 ++++++++++++++++++++++ tools/testing/selftests/bpf/verifier/wide_store.c | 36 ---------------------- 2 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 tools/testing/selftests/bpf/verifier/wide_access.c delete mode 100644 tools/testing/selftests/bpf/verifier/wide_store.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c new file mode 100644 index 000000000000..3ac97328432f --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -0,0 +1,36 @@ +#define BPF_SOCK_ADDR_STORE(field, off, res, err) \ +{ \ + "wide store to bpf_sock_addr." #field "[" #off "]", \ + .insns = { \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ +} + +/* user_ip6[0] is u64 aligned */ +BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), +BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + +/* msg_src_ip6[0] is _not_ u64 aligned */ +BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, + NULL), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), +BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + +#undef BPF_SOCK_ADDR_STORE diff --git a/tools/testing/selftests/bpf/verifier/wide_store.c b/tools/testing/selftests/bpf/verifier/wide_store.c deleted file mode 100644 index 8fe99602ded4..000000000000 --- a/tools/testing/selftests/bpf/verifier/wide_store.c +++ /dev/null @@ -1,36 +0,0 @@ -#define BPF_SOCK_ADDR(field, off, res, err) \ -{ \ - "wide store to bpf_sock_addr." #field "[" #off "]", \ - .insns = { \ - BPF_MOV64_IMM(BPF_REG_0, 1), \ - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \ - offsetof(struct bpf_sock_addr, field[off])), \ - BPF_EXIT_INSN(), \ - }, \ - .result = res, \ - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ - .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ - .errstr = err, \ -} - -/* user_ip6[0] is u64 aligned */ -BPF_SOCK_ADDR(user_ip6, 0, ACCEPT, - NULL), -BPF_SOCK_ADDR(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), -BPF_SOCK_ADDR(user_ip6, 2, ACCEPT, - NULL), -BPF_SOCK_ADDR(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), - -/* msg_src_ip6[0] is _not_ u64 aligned */ -BPF_SOCK_ADDR(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), -BPF_SOCK_ADDR(msg_src_ip6, 1, ACCEPT, - NULL), -BPF_SOCK_ADDR(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), -BPF_SOCK_ADDR(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), - -#undef BPF_SOCK_ADDR -- cgit v1.2.3-55-g7522 From 7dd8d6119d481da1c4619eb7d8cfef33edfbee81 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:55 -0700 Subject: selftests/bpf: add selftests for wide loads Mirror existing wide store tests with wide loads. The only significant difference is expected error string. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/verifier/wide_access.c | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c index 3ac97328432f..ccade9312d21 100644 --- a/tools/testing/selftests/bpf/verifier/wide_access.c +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -34,3 +34,40 @@ BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, "invalid bpf_context access off=56 size=8"), #undef BPF_SOCK_ADDR_STORE + +#define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ +{ \ + "wide load from bpf_sock_addr." #field "[" #off "]", \ + .insns = { \ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, \ + offsetof(struct bpf_sock_addr, field[off])), \ + BPF_MOV64_IMM(BPF_REG_0, 1), \ + BPF_EXIT_INSN(), \ + }, \ + .result = res, \ + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ + .errstr = err, \ +} + +/* user_ip6[0] is u64 aligned */ +BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, + "invalid bpf_context access off=12 size=8"), +BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, + "invalid bpf_context access off=20 size=8"), + +/* msg_src_ip6[0] is _not_ u64 aligned */ +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, + "invalid bpf_context access off=44 size=8"), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, + NULL), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, + "invalid bpf_context access off=52 size=8"), +BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, + "invalid bpf_context access off=56 size=8"), + +#undef BPF_SOCK_ADDR_LOAD -- cgit v1.2.3-55-g7522 From d5e1db990fcce571a4af2434d6fc2b312d94b45e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:44:41 -0700 Subject: selftests/bpf: remove logic duplication in test_verifier test_verifier tests can specify single- and multi-runs tests. Internally logic of handling them is duplicated. Get rid of it by making single run retval/data specification to be a first run spec. Signed-off-by: Andrii Nakryiko Cc: Krzesimir Nowak Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_verifier.c | 35 +++++++++++++---------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b0773291012a..84135d5f4b35 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -86,7 +86,7 @@ struct bpf_test { int fixup_sk_storage_map[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; - uint32_t retval, retval_unpriv, insn_processed; + uint32_t insn_processed; int prog_len; enum { UNDEF, @@ -95,16 +95,20 @@ struct bpf_test { } result, result_unpriv; enum bpf_prog_type prog_type; uint8_t flags; - __u8 data[TEST_DATA_LEN]; void (*fill_helper)(struct bpf_test *self); uint8_t runs; - struct { - uint32_t retval, retval_unpriv; - union { - __u8 data[TEST_DATA_LEN]; - __u64 data64[TEST_DATA_LEN / 8]; - }; - } retvals[MAX_TEST_RUNS]; +#define bpf_testdata_struct_t \ + struct { \ + uint32_t retval, retval_unpriv; \ + union { \ + __u8 data[TEST_DATA_LEN]; \ + __u64 data64[TEST_DATA_LEN / 8]; \ + }; \ + } + union { + bpf_testdata_struct_t; + bpf_testdata_struct_t retvals[MAX_TEST_RUNS]; + }; enum bpf_attach_type expected_attach_type; }; @@ -949,17 +953,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, uint32_t expected_val; int i; - if (!test->runs) { - expected_val = unpriv && test->retval_unpriv ? - test->retval_unpriv : test->retval; - - err = do_prog_test_run(fd_prog, unpriv, expected_val, - test->data, sizeof(test->data)); - if (err) - run_errs++; - else - run_successes++; - } + if (!test->runs) + test->runs = 1; for (i = 0; i < test->runs; i++) { if (unpriv && test->retvals[i].retval_unpriv) -- cgit v1.2.3-55-g7522 From 3461a0a02141b2612f2916e9250a430de41b0290 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 12:53:53 +0200 Subject: selftests/bpf: fix "alu with different scalars 1" on s390 BPF_LDX_MEM is used to load the least significant byte of the retrieved test_val.index, however, on big-endian machines it ends up retrieving the most significant byte. Change the test to load the whole int in order to make it endianness-independent. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/value_ptr_arith.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index c3de1a2c9dc5..a53d99cebd9f 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -183,7 +183,7 @@ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0x100000), -- cgit v1.2.3-55-g7522 From 4e59afbbed9638f95adfac8d2f222aca3b2b1bc0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 12:56:34 +0200 Subject: selftests/bpf: skip nmi test when perf hw events are disabled Some setups (e.g. virtual machines) might run with hardware perf events disabled. If this is the case, skip the test_send_signal_nmi test. Add a separate test involving a software perf event. This allows testing the perf event path regardless of hardware perf event support. Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/send_signal.c | 33 +++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 67cea1686305..54218ee3c004 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -173,6 +173,18 @@ static int test_send_signal_tracepoint(void) return test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint"); } +static int test_send_signal_perf(void) +{ + struct perf_event_attr attr = { + .sample_period = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + + return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, + "perf_sw_event"); +} + static int test_send_signal_nmi(void) { struct perf_event_attr attr = { @@ -181,8 +193,26 @@ static int test_send_signal_nmi(void) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; + int pmu_fd; + + /* Some setups (e.g. virtual machines) might run with hardware + * perf events disabled. If this is the case, skip this test. + */ + pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, + -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); + if (pmu_fd == -1) { + if (errno == ENOENT) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", + __func__); + return 0; + } + /* Let the test fail with a more informative message */ + } else { + close(pmu_fd); + } - return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, "perf_event"); + return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, + "perf_hw_event"); } void test_send_signal(void) @@ -190,6 +220,7 @@ void test_send_signal(void) int ret = 0; ret |= test_send_signal_tracepoint(); + ret |= test_send_signal_perf(); ret |= test_send_signal_nmi(); if (!ret) printf("test_send_signal:OK\n"); -- cgit v1.2.3-55-g7522 From cbd965bde74c3d0e53798404fd4b9829a68ccec8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt (VMware) Date: Wed, 3 Jul 2019 14:40:42 -0400 Subject: ftrace/selftests: Return the skip code when tracing directory not configured in kernel If the kernel is not configured with ftrace enabled, the ftracetest selftests should return the error code of "4" as that is the kselftests "skip" code, and not "1" which means an error. To determine if ftrace is enabled, first the newer "tracefs" is searched for in /proc/mounts. If it is not found, then "debugfs" is searched for (as old kernels do not have tracefs). If that is not found, an attempt to mount the tracefs or debugfs is performed. This is done by seeing first if the /sys/kernel/tracing directory exists. If it does than tracefs is configured in the kernel and an attempt to mount it is performed. If /sys/kernel/tracing does not exist, then /sys/kernel/debug is tested to see if that directory exists. If it does, then an attempt to mount debugfs on that directory is performed. If it does not exist, then debugfs is not configured in the running kernel and the test exits with the skip code. If either mount fails, then a normal error is returned as they do exist in the kernel but something went wrong to mount them. This changes the test to always try the tracefs file system first as it has been in the kernel for some time now and it is better to test it if it is available instead of always testing debugfs. Link: http://lkml.kernel.org/r/20190702062358.7330-1-po-hsu.lin@canonical.com Reported-by: Po-Hsu Lin Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/ftracetest | 38 ++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 136387422b00..edf5ea35d2a7 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -23,9 +23,15 @@ echo " If is -, all logs output in console only" exit $1 } +# default error +err_ret=1 + +# kselftest skip code is 4 +err_skip=4 + errexit() { # message echo "Error: $1" 1>&2 - exit 1 + exit $err_ret } # Ensuring user privilege @@ -116,11 +122,31 @@ parse_opts() { # opts } # Parameters -DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` -if [ -z "$DEBUGFS_DIR" ]; then - TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` -else - TRACING_DIR=$DEBUGFS_DIR/tracing +TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` +if [ -z "$TRACING_DIR" ]; then + DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` + if [ -z "$DEBUGFS_DIR" ]; then + # If tracefs exists, then so does /sys/kernel/tracing + if [ -d "/sys/kernel/tracing" ]; then + mount -t tracefs nodev /sys/kernel/tracing || + errexit "Failed to mount /sys/kernel/tracing" + TRACING_DIR="/sys/kernel/tracing" + # If debugfs exists, then so does /sys/kernel/debug + elif [ -d "/sys/kernel/debug" ]; then + mount -t debugfs nodev /sys/kernel/debug || + errexit "Failed to mount /sys/kernel/debug" + TRACING_DIR="/sys/kernel/debug/tracing" + else + err_ret=$err_skip + errexit "debugfs and tracefs are not configured in this kernel" + fi + else + TRACING_DIR="$DEBUGFS_DIR/tracing" + fi +fi +if [ ! -d "$TRACING_DIR" ]; then + err_ret=$err_skip + errexit "ftrace is not configured in this kernel" fi TOP_DIR=`absdir $0` -- cgit v1.2.3-55-g7522 From 6e55f320f00e4db7144a4906dd8ac53701891e31 Mon Sep 17 00:00:00 2001 From: Steven Rostedt (VMware) Date: Wed, 3 Jul 2019 15:46:08 -0400 Subject: ftrace/selftest: Test if set_event/ftrace_pid exists before writing While testing on a very old kernel (3.5), the tests failed because the write to set_event_pid in the setup code, did not exist. The tests themselves could pass, but the setup failed causing an error. Other files test for existance before writing to them. Do the same for set_event_pid and set_ftrace_pid. Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/test.d/functions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 779ec11f61bd..1d96c5f7e402 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -91,8 +91,8 @@ initialize_ftrace() { # Reset ftrace to initial-state reset_events_filter reset_ftrace_filter disable_events - echo > set_event_pid # event tracer is always on - echo > set_ftrace_pid + [ -f set_event_pid ] && echo > set_event_pid + [ -f set_ftrace_pid ] && echo > set_ftrace_pid [ -f set_ftrace_filter ] && echo | tee set_ftrace_* [ -f set_graph_function ] && echo | tee set_graph_* [ -f stack_trace_filter ] && echo > stack_trace_filter -- cgit v1.2.3-55-g7522 From 36646b22ce2455c25c76c9023a0e5ca58d62899a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 16 Jul 2019 12:38:36 -0700 Subject: selftests/bpf: fix test_verifier/test_maps make dependencies e46fc22e60a4 ("selftests/bpf: make directory prerequisites order-only") exposed existing problem in Makefile for test_verifier and test_maps tests: their dependency on auto-generated header file with a list of all tests wasn't recorded explicitly. This patch fixes these issues. Fixes: 51a0e301a563 ("bpf: Add BPF_MAP_TYPE_SK_STORAGE test to test_maps") Fixes: 6b7b6995c43e ("selftests: bpf: tests.h should depend on .c files, not the output") Cc: Ilya Leoshkevich Cc: Stanislav Fomichev Cc: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1296253b3422..9bc68d8abc5f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -86,8 +86,6 @@ $(OUTPUT)/urandom_read: $(OUTPUT)/%: %.c $(OUTPUT)/test_stub.o: test_stub.c $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) -c -o $@ $< -$(OUTPUT)/test_maps: map_tests/*.c - BPFOBJ := $(OUTPUT)/libbpf.a $(TEST_GEN_PROGS): $(OUTPUT)/test_stub.o $(BPFOBJ) @@ -257,9 +255,10 @@ MAP_TESTS_DIR = $(OUTPUT)/map_tests $(MAP_TESTS_DIR): mkdir -p $@ MAP_TESTS_H := $(MAP_TESTS_DIR)/tests.h +MAP_TESTS_FILES := $(wildcard map_tests/*.c) test_maps.c: $(MAP_TESTS_H) $(OUTPUT)/test_maps: CFLAGS += $(TEST_MAPS_CFLAGS) -MAP_TESTS_FILES := $(wildcard map_tests/*.c) +$(OUTPUT)/test_maps: test_maps.c $(MAP_TESTS_H) $(MAP_TESTS_FILES) $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) $(shell ( cd map_tests/; \ echo '/* Generated header, do not edit */'; \ @@ -276,6 +275,7 @@ $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h test_verifier.c: $(VERIFIER_TESTS_H) $(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) +$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) VERIFIER_TESTS_DIR = $(OUTPUT)/verifier $(VERIFIER_TESTS_DIR): -- cgit v1.2.3-55-g7522 From 9d1f62a6dcf0de3ab184c1f7b7d82117dcbab090 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 16 Jul 2019 12:38:37 -0700 Subject: selftests/bpf: structure test_{progs, maps, verifier} test runners uniformly It's easier to follow the logic if it's structured the same. There is just slight difference between test_progs/test_maps and test_verifier. test_verifier's verifier/*.c files are not really compilable C files (they are more of include headers), so they can't be specified as explicit dependencies of test_verifier. Cc: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9bc68d8abc5f..11c9c62c3362 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -176,6 +176,7 @@ endif endif TEST_PROGS_CFLAGS := -I. -I$(OUTPUT) +TEST_MAPS_CFLAGS := -I. -I$(OUTPUT) TEST_VERIFIER_CFLAGS := -I. -I$(OUTPUT) -Iverifier ifneq ($(SUBREG_CODEGEN),) @@ -227,16 +228,14 @@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif -PROG_TESTS_H := $(OUTPUT)/prog_tests/tests.h -test_progs.c: $(PROG_TESTS_H) -$(OUTPUT)/test_progs: CFLAGS += $(TEST_PROGS_CFLAGS) -$(OUTPUT)/test_progs: prog_tests/*.c - PROG_TESTS_DIR = $(OUTPUT)/prog_tests $(PROG_TESTS_DIR): mkdir -p $@ - +PROG_TESTS_H := $(PROG_TESTS_DIR)/tests.h PROG_TESTS_FILES := $(wildcard prog_tests/*.c) +test_progs.c: $(PROG_TESTS_H) +$(OUTPUT)/test_progs: CFLAGS += $(TEST_PROGS_CFLAGS) +$(OUTPUT)/test_progs: test_progs.c $(PROG_TESTS_H) $(PROG_TESTS_FILES) $(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) $(shell ( cd prog_tests/; \ echo '/* Generated header, do not edit */'; \ @@ -250,7 +249,6 @@ $(PROG_TESTS_H): $(PROG_TESTS_FILES) | $(PROG_TESTS_DIR) echo '#endif' \ ) > $(PROG_TESTS_H)) -TEST_MAPS_CFLAGS := -I. -I$(OUTPUT) MAP_TESTS_DIR = $(OUTPUT)/map_tests $(MAP_TESTS_DIR): mkdir -p $@ @@ -272,17 +270,15 @@ $(MAP_TESTS_H): $(MAP_TESTS_FILES) | $(MAP_TESTS_DIR) echo '#endif' \ ) > $(MAP_TESTS_H)) -VERIFIER_TESTS_H := $(OUTPUT)/verifier/tests.h -test_verifier.c: $(VERIFIER_TESTS_H) -$(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) -$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) - VERIFIER_TESTS_DIR = $(OUTPUT)/verifier $(VERIFIER_TESTS_DIR): mkdir -p $@ - +VERIFIER_TESTS_H := $(VERIFIER_TESTS_DIR)/tests.h VERIFIER_TEST_FILES := $(wildcard verifier/*.c) -$(OUTPUT)/verifier/tests.h: $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) +test_verifier.c: $(VERIFIER_TESTS_H) +$(OUTPUT)/test_verifier: CFLAGS += $(TEST_VERIFIER_CFLAGS) +$(OUTPUT)/test_verifier: test_verifier.c $(VERIFIER_TESTS_H) +$(VERIFIER_TESTS_H): $(VERIFIER_TEST_FILES) | $(VERIFIER_TESTS_DIR) $(shell ( cd verifier/; \ echo '/* Generated header, do not edit */'; \ echo '#ifdef FILL_ARRAY'; \ -- cgit v1.2.3-55-g7522 From 1cb59a6074e23c6f6513642f752a6d8d38327354 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 16 Jul 2019 14:58:27 +0200 Subject: selftests/bpf: fix perf_buffer on s390 perf_buffer test fails for exactly the same reason test_attach_probe used to fail: different nanosleep syscall kprobe name. Reuse the test_attach_probe fix. Fixes: ee5cf82ce04a ("selftests/bpf: test perf buffer API") Signed-off-by: Ilya Leoshkevich Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/attach_probe.c | 12 ++---------- tools/testing/selftests/bpf/prog_tests/perf_buffer.c | 8 +------- tools/testing/selftests/bpf/test_progs.h | 8 ++++++++ 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 47af4afc5013..5ecc267d98b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -21,14 +21,6 @@ ssize_t get_base_addr() { return -EINVAL; } -#ifdef __x86_64__ -#define SYS_KPROBE_NAME "__x64_sys_nanosleep" -#elif defined(__s390x__) -#define SYS_KPROBE_NAME "__s390x_sys_nanosleep" -#else -#define SYS_KPROBE_NAME "sys_nanosleep" -#endif - void test_attach_probe(void) { const char *kprobe_name = "kprobe/sys_nanosleep"; @@ -86,7 +78,7 @@ void test_attach_probe(void) kprobe_link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", "err %ld\n", PTR_ERR(kprobe_link))) { kprobe_link = NULL; @@ -94,7 +86,7 @@ void test_attach_probe(void) } kretprobe_link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe", "err %ld\n", PTR_ERR(kretprobe_link))) { kretprobe_link = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 3f1ef95865ff..3003fddc0613 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -5,12 +5,6 @@ #include #include -#ifdef __x86_64__ -#define SYS_KPROBE_NAME "__x64_sys_nanosleep" -#else -#define SYS_KPROBE_NAME "sys_nanosleep" -#endif - static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; @@ -56,7 +50,7 @@ void test_perf_buffer(void) /* attach kprobe */ link = bpf_program__attach_kprobe(prog, false /* retprobe */, - SYS_KPROBE_NAME); + SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link))) goto out_close; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f095e1d4c657..49e0f7d85643 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -92,3 +92,11 @@ int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); void *spin_lock_thread(void *arg); + +#ifdef __x86_64__ +#define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" +#elif defined(__s390x__) +#define SYS_NANOSLEEP_KPROBE_NAME "__s390x_sys_nanosleep" +#else +#define SYS_NANOSLEEP_KPROBE_NAME "sys_nanosleep" +#endif -- cgit v1.2.3-55-g7522 From bca1eac55a940025065645158c1a3429ac697df6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:36 -0700 Subject: tools/testing/selftests/proc/proc-pid-vm.c: hide "segfault at ffffffffff600000" dmesg spam Test tries to access vsyscall page and if it doesn't exist gets SIGSEGV which can spam into dmesg. However the segfault happens by design. Handle it and carry information via exit code to parent. Link: http://lkml.kernel.org/r/20190524181256.GA2260@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/proc-pid-vm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index 853aa164a401..18a3bde8bc96 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -215,6 +215,11 @@ static const char str_vsyscall[] = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ +static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) +{ + _exit(1); +} + /* * vsyscall page can't be unmapped, probe it with memory load. */ @@ -231,11 +236,19 @@ static void vsyscall(void) if (pid == 0) { struct rlimit rlim = {0, 0}; (void)setrlimit(RLIMIT_CORE, &rlim); + + /* Hide "segfault at ffffffffff600000" messages. */ + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + (void)sigaction(SIGSEGV, &act, NULL); + *(volatile int *)0xffffffffff600000UL; exit(0); } - wait(&wstatus); - if (WIFEXITED(wstatus)) { + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { g_vsyscall = true; } } -- cgit v1.2.3-55-g7522 From 7dbbade1f285e881119049563ab2a036c96dd9f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:48 -0700 Subject: proc: test /proc/sysvipc vs setns(CLONE_NEWIPC) I thought that /proc/sysvipc has the same bug as /proc/net commit 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 proc: fix /proc/net/* after setns(2) However, it doesn't! /proc/sysvipc files do get_ipc_ns(current->nsproxy->ipc_ns); in their open() hook and avoid the problem. Keep the test, maybe /proc/sysvipc will become broken someday :-\ Link: http://lkml.kernel.org/r/20190706180146.GA21015@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/setns-sysvipc.c | 133 +++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/proc/setns-sysvipc.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 444ad39d3700..66fab4c58ed4 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -12,4 +12,5 @@ /read /self /setns-dcache +/setns-sysvipc /thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 9f09fcd09ea3..a8ed0f684829 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -17,6 +17,7 @@ TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read TEST_GEN_PROGS += self TEST_GEN_PROGS += setns-dcache +TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/setns-sysvipc.c b/tools/testing/selftests/proc/setns-sysvipc.c new file mode 100644 index 000000000000..903890c5e587 --- /dev/null +++ b/tools/testing/selftests/proc/setns-sysvipc.c @@ -0,0 +1,133 @@ +/* + * Copyright © 2019 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that setns(CLONE_NEWIPC) points to new /proc/sysvipc content even + * if old one is in dcache. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pid_t pid = -1; + +static void f(void) +{ + if (pid > 0) { + kill(pid, SIGTERM); + } +} + +int main(void) +{ + int fd[2]; + char _ = 0; + int nsfd; + + atexit(f); + + /* Check for priviledges and syscall availability straight away. */ + if (unshare(CLONE_NEWIPC) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + /* Distinguisher between two otherwise empty IPC namespaces. */ + if (shmget(IPC_PRIVATE, 1, IPC_CREAT) == -1) { + return 1; + } + + if (pipe(fd) == -1) { + return 1; + } + + pid = fork(); + if (pid == -1) { + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWIPC) == -1) { + return 1; + } + + if (write(fd[1], &_, 1) != 1) { + return 1; + } + + pause(); + + return 0; + } + + if (read(fd[0], &_, 1) != 1) { + return 1; + } + + { + char buf[64]; + snprintf(buf, sizeof(buf), "/proc/%u/ns/ipc", pid); + nsfd = open(buf, O_RDONLY); + if (nsfd == -1) { + return 1; + } + } + + /* Reliably pin dentry into dcache. */ + (void)open("/proc/sysvipc/shm", O_RDONLY); + + if (setns(nsfd, CLONE_NEWIPC) == -1) { + return 1; + } + + kill(pid, SIGTERM); + pid = 0; + + { + char buf[4096]; + ssize_t rv; + int fd; + + fd = open("/proc/sysvipc/shm", O_RDONLY); + if (fd == -1) { + return 1; + } + +#define S32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n" +#define S64 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n" + rv = read(fd, buf, sizeof(buf)); + if (rv == strlen(S32)) { + assert(memcmp(buf, S32, strlen(S32)) == 0); + } else if (rv == strlen(S64)) { + assert(memcmp(buf, S64, strlen(S64)) == 0); + } else { + assert(0); + } + } + + return 0; +} -- cgit v1.2.3-55-g7522 From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Tue, 16 Jul 2019 16:29:42 -0700 Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain details of the syscall the tracee is blocked in. There are two reasons for a special syscall-related ptrace request. Firstly, with the current ptrace API there are cases when ptracer cannot retrieve necessary information about syscalls. Some examples include: * The notorious int-0x80-from-64-bit-task issue. See [1] for details. In short, if a 64-bit task performs a syscall through int 0x80, its tracer has no reliable means to find out that the syscall was, in fact, a compat syscall, and misidentifies it. * Syscall-enter-stop and syscall-exit-stop look the same for the tracer. Common practice is to keep track of the sequence of ptrace-stops in order not to mix the two syscall-stops up. But it is not as simple as it looks; for example, strace had a (just recently fixed) long-standing bug where attaching strace to a tracee that is performing the execve system call led to the tracer identifying the following syscall-exit-stop as syscall-enter-stop, which messed up all the state tracking. * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA and process_vm_readv become unavailable when the process dumpable flag is cleared. On such architectures as ia64 this results in all syscall arguments being unavailable for the tracer. Secondly, ptracers also have to support a lot of arch-specific code for obtaining information about the tracee. For some architectures, this requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall argument and return value. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_GET_SYSCALL_INFO Retrieve information about the syscall that caused the stop. The information is placed into the buffer pointed by "data" argument, which should be a pointer to a buffer of type "struct ptrace_syscall_info". The "addr" argument contains the size of the buffer pointed to by "data" argument (i.e., sizeof(struct ptrace_syscall_info)). The return value contains the number of bytes available to be written by the kernel. If the size of data to be written by the kernel exceeds the size specified by "addr" argument, the output is truncated. [ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO] Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org Signed-off-by: Elvira Khabirova Co-developed-by: Dmitry V. Levin Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Shuah Khan Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 9 ++- include/uapi/linux/ptrace.h | 35 +++++++++ kernel/ptrace.c | 101 +++++++++++++++++++++++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 +++- 4 files changed, 150 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8446573cc682..36fb3bbed6b2 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -54,13 +54,15 @@ struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ -static inline int ptrace_report_syscall(struct pt_regs *regs) +static inline int ptrace_report_syscall(struct pt_regs *regs, + unsigned long message) { int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return 0; + current->ptrace_message = message; ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) current->exit_code = 0; } + current->ptrace_message = 0; return fatal_signal_pending(current); } @@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs); + return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** @@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) if (step) user_single_step_report(regs); else - ptrace_report_syscall(regs); + ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); } /** diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index d5a1b8a492b9..a71b6e3b03eb 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -73,6 +73,41 @@ struct seccomp_metadata { __u64 flags; /* Output: filter's flags */ }; +#define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SYSCALL_INFO_NONE 0 +#define PTRACE_SYSCALL_INFO_ENTRY 1 +#define PTRACE_SYSCALL_INFO_EXIT 2 +#define PTRACE_SYSCALL_INFO_SECCOMP 3 + +struct ptrace_syscall_info { + __u8 op; /* PTRACE_SYSCALL_INFO_* */ + __u32 arch __attribute__((__aligned__(sizeof(__u32)))); + __u64 instruction_pointer; + __u64 stack_pointer; + union { + struct { + __u64 nr; + __u64 args[6]; + } entry; + struct { + __s64 rval; + __u8 is_error; + } exit; + struct { + __u64 nr; + __u64 args[6]; + __u32 ret_data; + } seccomp; + }; +}; + +/* + * These values are stored in task->ptrace_message + * by tracehook_report_syscall_* to describe the current syscall-stop. + */ +#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 +#define PTRACE_EVENTMSG_SYSCALL_EXIT 2 + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include #include +#include /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index dc66fe852768..6ef7f16c4cf5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, unsigned long msg; static bool entry; - /* Make sure we got an empty message. */ + /* + * The traditional way to tell PTRACE_SYSCALL entry/exit + * is by counting. + */ + entry = !entry; + + /* Make sure we got an appropriate message. */ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); EXPECT_EQ(0, ret); - EXPECT_EQ(0, msg); + EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY + : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); - /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */ - entry = !entry; if (!entry) return; -- cgit v1.2.3-55-g7522 From ac76de555d76b8cc7f8ef231692a3ad9cbd0ce63 Mon Sep 17 00:00:00 2001 From: Dmitry V. Levin Date: Tue, 16 Jul 2019 16:29:46 -0700 Subject: selftests/ptrace: add a test case for PTRACE_GET_SYSCALL_INFO Check whether PTRACE_GET_SYSCALL_INFO semantics implemented in the kernel matches userspace expectations. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190510152852.GG28558@altlinux.org Signed-off-by: Dmitry V. Levin Acked-by: Shuah Khan Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Kees Cook Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/ptrace/.gitignore | 1 + tools/testing/selftests/ptrace/Makefile | 2 +- tools/testing/selftests/ptrace/get_syscall_info.c | 271 ++++++++++++++++++++++ 3 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/ptrace/get_syscall_info.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore index b3e59d41fd82..cfcc49a7def7 100644 --- a/tools/testing/selftests/ptrace/.gitignore +++ b/tools/testing/selftests/ptrace/.gitignore @@ -1 +1,2 @@ +get_syscall_info peeksiginfo diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index cb21c76a18ca..c0b7f89f0930 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -iquote../../../../include/uapi -Wall -TEST_GEN_PROGS := peeksiginfo +TEST_GEN_PROGS := get_syscall_info peeksiginfo include ../lib.mk diff --git a/tools/testing/selftests/ptrace/get_syscall_info.c b/tools/testing/selftests/ptrace/get_syscall_info.c new file mode 100644 index 000000000000..5bcd1c7b5be6 --- /dev/null +++ b/tools/testing/selftests/ptrace/get_syscall_info.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2018 Dmitry V. Levin + * All rights reserved. + * + * Check whether PTRACE_GET_SYSCALL_INFO semantics implemented in the kernel + * matches userspace expectations. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include "linux/ptrace.h" + +static int +kill_tracee(pid_t pid) +{ + if (!pid) + return 0; + + int saved_errno = errno; + + int rc = kill(pid, SIGKILL); + + errno = saved_errno; + return rc; +} + +static long +sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data) +{ + return syscall(__NR_ptrace, request, pid, addr, data); +} + +#define LOG_KILL_TRACEE(fmt, ...) \ + do { \ + kill_tracee(pid); \ + TH_LOG("wait #%d: " fmt, \ + ptrace_stop, ##__VA_ARGS__); \ + } while (0) + +TEST(get_syscall_info) +{ + static const unsigned long args[][7] = { + /* a sequence of architecture-agnostic syscalls */ + { + __NR_chdir, + (unsigned long) "", + 0xbad1fed1, + 0xbad2fed2, + 0xbad3fed3, + 0xbad4fed4, + 0xbad5fed5 + }, + { + __NR_gettid, + 0xcaf0bea0, + 0xcaf1bea1, + 0xcaf2bea2, + 0xcaf3bea3, + 0xcaf4bea4, + 0xcaf5bea5 + }, + { + __NR_exit_group, + 0, + 0xfac1c0d1, + 0xfac2c0d2, + 0xfac3c0d3, + 0xfac4c0d4, + 0xfac5c0d5 + } + }; + const unsigned long *exp_args; + + pid_t pid = fork(); + + ASSERT_LE(0, pid) { + TH_LOG("fork: %m"); + } + + if (pid == 0) { + /* get the pid before PTRACE_TRACEME */ + pid = getpid(); + ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) { + TH_LOG("PTRACE_TRACEME: %m"); + } + ASSERT_EQ(0, kill(pid, SIGSTOP)) { + /* cannot happen */ + TH_LOG("kill SIGSTOP: %m"); + } + for (unsigned int i = 0; i < ARRAY_SIZE(args); ++i) { + syscall(args[i][0], + args[i][1], args[i][2], args[i][3], + args[i][4], args[i][5], args[i][6]); + } + /* unreachable */ + _exit(1); + } + + const struct { + unsigned int is_error; + int rval; + } *exp_param, exit_param[] = { + { 1, -ENOENT }, /* chdir */ + { 0, pid } /* gettid */ + }; + + unsigned int ptrace_stop; + + for (ptrace_stop = 0; ; ++ptrace_stop) { + struct ptrace_syscall_info info = { + .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */ + }; + const size_t size = sizeof(info); + const int expected_none_size = + (void *) &info.entry - (void *) &info; + const int expected_entry_size = + (void *) &info.entry.args[6] - (void *) &info; + const int expected_exit_size = + (void *) (&info.exit.is_error + 1) - + (void *) &info; + int status; + long rc; + + ASSERT_EQ(pid, wait(&status)) { + /* cannot happen */ + LOG_KILL_TRACEE("wait: %m"); + } + if (WIFEXITED(status)) { + pid = 0; /* the tracee is no more */ + ASSERT_EQ(0, WEXITSTATUS(status)); + break; + } + ASSERT_FALSE(WIFSIGNALED(status)) { + pid = 0; /* the tracee is no more */ + LOG_KILL_TRACEE("unexpected signal %u", + WTERMSIG(status)); + } + ASSERT_TRUE(WIFSTOPPED(status)) { + /* cannot happen */ + LOG_KILL_TRACEE("unexpected wait status %#x", status); + } + + switch (WSTOPSIG(status)) { + case SIGSTOP: + ASSERT_EQ(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected signal stop"); + } + ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, pid, 0, + PTRACE_O_TRACESYSGOOD)) { + LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m"); + } + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + pid, size, + (unsigned long) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + ASSERT_EQ(expected_none_size, rc) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_NONE, info.op) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("signal stop mismatch"); + } + break; + + case SIGTRAP | 0x80: + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + pid, size, + (unsigned long) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + switch (ptrace_stop) { + case 1: /* entering chdir */ + case 3: /* entering gettid */ + case 5: /* entering exit_group */ + exp_args = args[ptrace_stop / 2]; + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info.op) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[0], info.entry.nr) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[1], info.entry.args[0]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[2], info.entry.args[1]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[3], info.entry.args[2]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[4], info.entry.args[3]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[5], info.entry.args[4]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + ASSERT_EQ(exp_args[6], info.entry.args[5]) { + LOG_KILL_TRACEE("entry stop mismatch"); + } + break; + case 2: /* exiting chdir */ + case 4: /* exiting gettid */ + exp_param = &exit_param[ptrace_stop / 2 - 1]; + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info.op) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.arch) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.instruction_pointer) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_TRUE(info.stack_pointer) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(exp_param->is_error, + info.exit.is_error) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + ASSERT_EQ(exp_param->rval, info.exit.rval) { + LOG_KILL_TRACEE("exit stop mismatch"); + } + break; + default: + LOG_KILL_TRACEE("unexpected syscall stop"); + abort(); + } + break; + + default: + LOG_KILL_TRACEE("unexpected stop signal %#x", + WSTOPSIG(status)); + abort(); + } + + ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, pid, 0, 0)) { + LOG_KILL_TRACEE("PTRACE_SYSCALL: %m"); + } + } + + ASSERT_EQ(ARRAY_SIZE(args) * 2, ptrace_stop); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3-55-g7522 From adb701d6cfa432f5dbdf28839b5e64291a7ed30b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Jul 2019 14:41:59 -0700 Subject: selftests: add a test case for rp_filter Add a test case to simulate the loopback packet case fixed in the previous patch. This test gets passed after the fix: IPv4 rp_filter tests TEST: rp_filter passes local packets [ OK ] TEST: rp_filter passes loopback packets [ OK ] Cc: David Ahern Signed-off-by: Cong Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_tests.sh | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 9457aaeae092..4465fc2dae14 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,12 +9,13 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw" +TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" VERBOSE=0 PAUSE_ON_FAIL=no PAUSE=no IP="ip -netns ns1" +NS_EXEC="ip netns exec ns1" log_test() { @@ -433,6 +434,37 @@ fib_carrier_test() fib_carrier_unicast_test } +fib_rp_filter_test() +{ + echo + echo "IPv4 rp_filter tests" + + setup + + set -e + $IP link set dev lo address 52:54:00:6a:c7:5e + $IP link set dummy0 address 52:54:00:6a:c7:5e + $IP link add dummy1 type dummy + $IP link set dummy1 address 52:54:00:6a:c7:5e + $IP link set dev dummy1 up + $NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1 + $NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1 + $NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1 + + $NS_EXEC tc qd add dev dummy1 parent root handle 1: fq_codel + $NS_EXEC tc filter add dev dummy1 parent 1: protocol arp basic action mirred egress redirect dev lo + $NS_EXEC tc filter add dev dummy1 parent 1: protocol ip basic action mirred egress redirect dev lo + set +e + + run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 198.51.100.1" + log_test $? 0 "rp_filter passes local packets" + + run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 127.0.0.1" + log_test $? 0 "rp_filter passes loopback packets" + + cleanup +} + ################################################################################ # Tests on nexthop spec @@ -1557,6 +1589,7 @@ do fib_unreg_test|unregister) fib_unreg_test;; fib_down_test|down) fib_down_test;; fib_carrier_test|carrier) fib_carrier_test;; + fib_rp_filter_test|rp_filter) fib_rp_filter_test;; fib_nexthop_test|nexthop) fib_nexthop_test;; ipv6_route_test|ipv6_rt) ipv6_route_test;; ipv4_route_test|ipv4_rt) ipv4_route_test;; -- cgit v1.2.3-55-g7522 From 01a0f9e4496d9f54e06abb71bf9f56c617ef8c24 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 18 Jul 2019 11:13:35 +0200 Subject: selftests/bpf: fix "valid read map access into a read-only array 1" on s390 This test looks up a 32-bit map element and then loads it using a 64-bit load. This does not work on s390, which is a big-endian machine. Since the point of this test doesn't seem to be loading a smaller value using a larger load, simply use a 32-bit load. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/array_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index bcb83196e459..f3c33e128709 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -226,7 +226,7 @@ BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_array_ro = { 3 }, -- cgit v1.2.3-55-g7522 From 59fd3486c3dd5678bc2fcac75e14466775465c3e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 17 Jul 2019 14:26:20 +0200 Subject: selftests/bpf: fix test_xdp_noinline on s390 test_xdp_noinline fails on s390 due to a handful of endianness issues. Use ntohs for parsing eth_proto. Replace bswaps with ntohs/htons. Signed-off-by: Ilya Leoshkevich Acked-by: Vasily Gorbik Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_xdp_noinline.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index dad8a7e33eaa..e88d7b9d65ab 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -14,6 +14,7 @@ #include #include #include "bpf_helpers.h" +#include "bpf_endian.h" static __u32 rol32(__u32 word, unsigned int shift) { @@ -305,7 +306,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, ip6h->nexthdr = IPPROTO_IPV6; ip_suffix = pckt->flow.srcv6[3] ^ pckt->flow.port16[0]; ip6h->payload_len = - __builtin_bswap16(pkt_bytes + sizeof(struct ipv6hdr)); + bpf_htons(pkt_bytes + sizeof(struct ipv6hdr)); ip6h->hop_limit = 4; ip6h->saddr.in6_u.u6_addr32[0] = 1; @@ -322,7 +323,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct real_definition *dst, __u32 pkt_bytes) { - __u32 ip_suffix = __builtin_bswap16(pckt->flow.port16[0]); + __u32 ip_suffix = bpf_ntohs(pckt->flow.port16[0]); struct eth_hdr *new_eth; struct eth_hdr *old_eth; __u16 *next_iph_u16; @@ -352,7 +353,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, iph->protocol = IPPROTO_IPIP; iph->check = 0; iph->tos = 1; - iph->tot_len = __builtin_bswap16(pkt_bytes + sizeof(struct iphdr)); + iph->tot_len = bpf_htons(pkt_bytes + sizeof(struct iphdr)); /* don't update iph->daddr, since it will overwrite old eth_proto * and multiple iterations of bpf_prog_run() will fail */ @@ -639,7 +640,7 @@ static int process_l3_headers_v6(struct packet_description *pckt, iph_len = sizeof(struct ipv6hdr); *protocol = ip6h->nexthdr; pckt->flow.proto = *protocol; - *pkt_bytes = __builtin_bswap16(ip6h->payload_len); + *pkt_bytes = bpf_ntohs(ip6h->payload_len); off += iph_len; if (*protocol == 45) { return XDP_DROP; @@ -671,7 +672,7 @@ static int process_l3_headers_v4(struct packet_description *pckt, return XDP_DROP; *protocol = iph->protocol; pckt->flow.proto = *protocol; - *pkt_bytes = __builtin_bswap16(iph->tot_len); + *pkt_bytes = bpf_ntohs(iph->tot_len); off += 20; if (iph->frag_off & 65343) return XDP_DROP; @@ -808,10 +809,10 @@ int balancer_ingress(struct xdp_md *ctx) nh_off = sizeof(struct eth_hdr); if (data + nh_off > data_end) return XDP_DROP; - eth_proto = eth->eth_proto; - if (eth_proto == 8) + eth_proto = bpf_ntohs(eth->eth_proto); + if (eth_proto == ETH_P_IP) return process_packet(data, nh_off, data_end, 0, ctx); - else if (eth_proto == 56710) + else if (eth_proto == ETH_P_IPV6) return process_packet(data, nh_off, data_end, 1, ctx); else return XDP_DROP; -- cgit v1.2.3-55-g7522