Channel: Serverphorums.com

[PATCH v4 5/6] arch: introduce memremap_cache() and memremap_wt()

Existing users of ioremap_cache() are mapping memory that is known in
advance to not have i/o side effects. These users are forced to cast
away the __iomem annotation, or otherwise neglect to fix the sparse
errors thrown when dereferencing pointers to this memory. Provide
memremap_*() as a non __iomem annotated ioremap_*().

The ARCH_HAS_MEMREMAP kconfig symbol is introduced for archs to assert
that it is safe to recast / reuse the return value from ioremap as a
normal pointer to memory. In other words, archs that mandate specific
accessors for __iomem are not memremap() capable and drivers that care,
like pmem, can add a dependency to disable themselves on these archs.
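
A minimal usage sketch (illustrative only; 'foo_phys', 'foo_len' and
'struct foo_table' are made-up names, not part of the patch):

	/* today: cast away __iomem to keep sparse quiet */
	tbl = (__force struct foo_table *) ioremap_cache(foo_phys, foo_len);

	/* with memremap_cache(): plain pointer, no cast, no sparse warning */
	tbl = memremap_cache(foo_phys, foo_len);
	if (!tbl)
		return -ENOMEM;
	/* ... dereference tbl directly ... */
	memunmap(tbl);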

Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/arm/Kconfig | 1 +
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/efi.c | 4 ++-
arch/arm64/kernel/smp_spin_table.c | 10 ++++----
arch/frv/Kconfig | 1 +
arch/m68k/Kconfig | 1 +
arch/metag/Kconfig | 1 +
arch/mips/Kconfig | 1 +
arch/powerpc/Kconfig | 1 +
arch/x86/Kconfig | 1 +
arch/x86/kernel/crash_dump_64.c | 6 ++---
arch/x86/kernel/kdebugfs.c | 8 +++----
arch/x86/kernel/ksysfs.c | 28 ++++++++++++-----------
arch/x86/mm/ioremap.c | 10 +++-----
arch/xtensa/Kconfig | 1 +
drivers/acpi/apei/einj.c | 8 +++----
drivers/acpi/apei/erst.c | 4 ++-
drivers/block/Kconfig | 1 +
drivers/block/pmem.c | 7 +++---
drivers/firmware/google/memconsole.c | 4 ++-
include/linux/device.h | 5 ++++
include/linux/io.h | 4 +++
kernel/resource.c | 41 +++++++++++++++++++++++++++++++++-
lib/Kconfig | 5 +++-
24 files changed, 107 insertions(+), 47 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..397426f8ca37 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -3,6 +3,7 @@ config ARM
default y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_MEMREMAP
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7796af4b1d6f..f07a9a5af61e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -5,6 +5,7 @@ config ARM64
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_MEMREMAP
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ab21e0d58278..b672ef33f08b 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -289,7 +289,7 @@ static int __init arm64_enable_runtime_services(void)
pr_info("Remapping and enabling EFI services.\n");

mapsize = memmap.map_end - memmap.map;
- memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map,
+ memmap.map = memremap_cache((phys_addr_t)memmap.phys_map,
mapsize);
if (!memmap.map) {
pr_err("Failed to remap EFI memory map\n");
@@ -298,7 +298,7 @@ static int __init arm64_enable_runtime_services(void)
memmap.map_end = memmap.map + mapsize;
efi.memmap = &memmap;

- efi.systab = (__force void *)ioremap_cache(efi_system_table,
+ efi.systab = memremap_cache(efi_system_table,
sizeof(efi_system_table_t));
if (!efi.systab) {
pr_err("Failed to remap EFI System Table\n");
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 14944e5b28da..893c8586e20f 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -67,18 +67,18 @@ static int smp_spin_table_cpu_init(struct device_node *dn, unsigned int cpu)

static int smp_spin_table_cpu_prepare(unsigned int cpu)
{
- __le64 __iomem *release_addr;
+ __le64 *release_addr;

if (!cpu_release_addr[cpu])
return -ENODEV;

/*
* The cpu-release-addr may or may not be inside the linear mapping.
- * As ioremap_cache will either give us a new mapping or reuse the
+ * As memremap_cache will either give us a new mapping or reuse the
* existing linear mapping, we can use it to cover both cases. In
* either case the memory will be MT_NORMAL.
*/
- release_addr = ioremap_cache(cpu_release_addr[cpu],
+ release_addr = memremap_cache(cpu_release_addr[cpu],
sizeof(*release_addr));
if (!release_addr)
return -ENOMEM;
@@ -91,7 +91,7 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
* the boot protocol.
*/
writeq_relaxed(__pa(secondary_holding_pen), release_addr);
- __flush_dcache_area((__force void *)release_addr,
+ __flush_dcache_area(release_addr,
sizeof(*release_addr));

/*
@@ -99,7 +99,7 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
*/
sev();

- iounmap(release_addr);
+ memunmap(release_addr);

return 0;
}
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 34aa19352dc1..2373bf183527 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -14,6 +14,7 @@ config FRV
select OLD_SIGSUSPEND3
select OLD_SIGACTION
select HAVE_DEBUG_STACKOVERFLOW
+ select ARCH_HAS_MEMREMAP

config ZONE_DMA
bool
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..831b1be8c43d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -23,6 +23,7 @@ config M68K
select MODULES_USE_ELF_RELA
select OLD_SIGSUSPEND3
select OLD_SIGACTION
+ select ARCH_HAS_MEMREMAP

config RWSEM_GENERIC_SPINLOCK
bool
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..5669fe3eb807 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -24,6 +24,7 @@ config METAG
select HAVE_PERF_EVENTS
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNDERSCORE_SYMBOL_PREFIX
+ select ARCH_HAS_MEMREMAP
select IRQ_DOMAIN
select MODULES_USE_ELF_RELA
select OF
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f5016656494f..9ee35e615c0d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -58,6 +58,7 @@ config MIPS
select SYSCTL_EXCEPTION_TRACE
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select HAVE_IRQ_TIME_ACCOUNTING
+ select ARCH_HAS_MEMREMAP

menu "Machine selection"

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..73c1f8b1f022 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,7 @@ config PPC
select NO_BOOTMEM
select HAVE_GENERIC_RCU_GUP
select HAVE_PERF_EVENTS_NMI if PPC64
+ select ARCH_HAS_MEMREMAP

config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 228aa35d7e89..f16caf7eac27 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_MEMREMAP
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index afa64adb75ee..8e04011665fd 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -31,19 +31,19 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;

- vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+ vaddr = memremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;

if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) {
- iounmap(vaddr);
+ memunmap(vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);

set_iounmap_nonlazy();
- iounmap(vaddr);
+ memunmap(vaddr);
return csize;
}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index dc1404bf8e4b..731b10e2814f 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -49,7 +49,7 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
pa = node->paddr + sizeof(struct setup_data) + pos;
pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
if (PageHighMem(pg)) {
- p = ioremap_cache(pa, count);
+ p = memremap_cache(pa, count);
if (!p)
return -ENXIO;
} else
@@ -58,7 +58,7 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
remain = copy_to_user(user_buf, p, count);

if (PageHighMem(pg))
- iounmap(p);
+ memunmap(p);

if (remain)
return -EFAULT;
@@ -128,7 +128,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)

pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
if (PageHighMem(pg)) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap_cache(pa_data, sizeof(*data));
if (!data) {
kfree(node);
error = -ENXIO;
@@ -144,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
pa_data = data->next;

if (PageHighMem(pg))
- iounmap(data);
+ memunmap(data);
if (error)
goto err_dir;
no++;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index c2bedaea11f7..2fbc62886eae 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/io.h>

-#include <asm/io.h>
#include <asm/setup.h>

static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
*paddr = pa_data;
return 0;
}
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap_cache(pa_data, sizeof(*data));
if (!data)
return -ENOMEM;

pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
u64 pa_data = boot_params.hdr.setup_data;

while (pa_data) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap_cache(pa_data, sizeof(*data));
if (!data)
return -ENOMEM;
if (nr == i) {
*size = data->len;
- iounmap(data);
+ memunmap(data);
return 0;
}

pa_data = data->next;
- iounmap(data);
+ memunmap(data);
i++;
}
return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap_cache(paddr, sizeof(*data));
if (!data)
return -ENOMEM;

ret = sprintf(buf, "0x%x\n", data->type);
- iounmap(data);
+ memunmap(data);
return ret;
}

@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
ret = get_setup_data_paddr(nr, &paddr);
if (ret)
return ret;
- data = ioremap_cache(paddr, sizeof(*data));
+ data = memremap_cache(paddr, sizeof(*data));
if (!data)
return -ENOMEM;

@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
goto out;

ret = count;
- p = ioremap_cache(paddr + sizeof(*data), data->len);
+ p = memremap_cache(paddr + sizeof(*data), data->len);
if (!p) {
ret = -ENOMEM;
goto out;
}
memcpy(buf, p + off, count);
- iounmap(p);
+ memunmap(p);
out:
- iounmap(data);
+ memunmap(data);
return ret;
}

@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
*nr = 0;
while (pa_data) {
*nr += 1;
- data = ioremap_cache(pa_data, sizeof(*data));
+ data = memremap_cache(pa_data, sizeof(*data));
if (!data) {
ret = -ENOMEM;
goto out;
}
pa_data = data->next;
- iounmap(data);
+ memunmap(data);
}

out:
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cc5ccc415cc0..7f087cb793fa 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -414,12 +414,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);

- vaddr = ioremap_cache(start, PAGE_SIZE);
- /* Only add the offset on success and return NULL if the ioremap() failed: */
+ vaddr = memremap_cache(start, PAGE_SIZE);
if (vaddr)
- vaddr += offset;
-
- return vaddr;
+ return vaddr + offset;
+ return NULL;
}

void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -427,7 +425,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
if (page_is_ram(phys >> PAGE_SHIFT))
return;

- iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ memunmap((void *)((unsigned long)addr & PAGE_MASK));
}

static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 87be10e8b57a..e601faf87cee 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -3,6 +3,7 @@ config ZONE_DMA

config XTENSA
def_bool y
+ select ARCH_HAS_MEMREMAP
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index a095d4f858da..2ec9006cfb6c 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -318,7 +318,7 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type,
sizeof(*trigger_tab) - 1);
goto out;
}
- trigger_tab = ioremap_cache(trigger_paddr, sizeof(*trigger_tab));
+ trigger_tab = memremap_cache(trigger_paddr, sizeof(*trigger_tab));
if (!trigger_tab) {
pr_err(EINJ_PFX "Failed to map trigger table!\n");
goto out_rel_header;
@@ -346,8 +346,8 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type,
(unsigned long long)trigger_paddr + table_size - 1);
goto out_rel_header;
}
- iounmap(trigger_tab);
- trigger_tab = ioremap_cache(trigger_paddr, table_size);
+ memunmap(trigger_tab);
+ trigger_tab = memremap_cache(trigger_paddr, table_size);
if (!trigger_tab) {
pr_err(EINJ_PFX "Failed to map trigger table!\n");
goto out_rel_entry;
@@ -409,7 +409,7 @@ out_rel_header:
release_mem_region(trigger_paddr, sizeof(*trigger_tab));
out:
if (trigger_tab)
- iounmap(trigger_tab);
+ memunmap(trigger_tab);

return rc;
}
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 3670bbab57a3..6b95066da51d 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -77,7 +77,7 @@ static struct acpi_table_erst *erst_tab;
static struct erst_erange {
u64 base;
u64 size;
- void __iomem *vaddr;
+ void *vaddr;
u32 attr;
} erst_erange;

@@ -1185,7 +1185,7 @@ static int __init erst_init(void)
goto err_unmap_reg;
}
rc = -ENOMEM;
- erst_erange.vaddr = ioremap_cache(erst_erange.base,
+ erst_erange.vaddr = memremap_cache(erst_erange.base,
erst_erange.size);
if (!erst_erange.vaddr)
goto err_release_erange;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index eb1fed5bd516..98418fc330ae 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -406,6 +406,7 @@ config BLK_DEV_RAM_DAX

config BLK_DEV_PMEM
tristate "Persistent memory block device support"
+ depends on ARCH_HAS_MEMREMAP
help
Saying Y here will allow you to use a contiguous range of reserved
memory as one or more persistent block devices.
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 095dfaadcaa5..b00b97314b57 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
+#include <linux/io.h>

#define PMEM_MINORS 16

@@ -143,7 +144,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
* of the CPU caches in case of a crash.
*/
err = -ENOMEM;
- pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size);
+ pmem->virt_addr = memremap_wt(pmem->phys_addr, pmem->size);
if (!pmem->virt_addr)
goto out_release_region;

@@ -179,7 +180,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
out_free_queue:
blk_cleanup_queue(pmem->pmem_queue);
out_unmap:
- iounmap(pmem->virt_addr);
+ memunmap(pmem->virt_addr);
out_release_region:
release_mem_region(pmem->phys_addr, pmem->size);
out_free_dev:
@@ -193,7 +194,7 @@ static void pmem_free(struct pmem_device *pmem)
del_gendisk(pmem->pmem_disk);
put_disk(pmem->pmem_disk);
blk_cleanup_queue(pmem->pmem_queue);
- iounmap(pmem->virt_addr);
+ memunmap(pmem->virt_addr);
release_mem_region(pmem->phys_addr, pmem->size);
kfree(pmem);
}
diff --git a/drivers/firmware/google/memconsole.c b/drivers/firmware/google/memconsole.c
index 2f569aaed4c7..877433dc8297 100644
--- a/drivers/firmware/google/memconsole.c
+++ b/drivers/firmware/google/memconsole.c
@@ -52,14 +52,14 @@ static ssize_t memconsole_read(struct file *filp, struct kobject *kobp,
char *memconsole;
ssize_t ret;

- memconsole = ioremap_cache(memconsole_baseaddr, memconsole_length);
+ memconsole = memremap_cache(memconsole_baseaddr, memconsole_length);
if (!memconsole) {
pr_err("memconsole: ioremap_cache failed\n");
return -ENOMEM;
}
ret = memory_read_from_buffer(buf, count, &pos, memconsole,
memconsole_length);
- iounmap(memconsole);
+ memunmap(memconsole);
return ret;
}

diff --git a/include/linux/device.h b/include/linux/device.h
index 6558af90c8fe..518f49c5d596 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -638,6 +638,11 @@ extern void devm_free_pages(struct device *dev, unsigned long addr);

void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res);

+static inline void *devm_memremap_resource(struct device *dev, struct resource *res)
+{
+ return (void __force *) devm_ioremap_resource(dev, res);
+}
+
/* allows to add/remove a custom action to devres stack */
int devm_add_action(struct device *dev, void (*action)(void *), void *data);
void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
diff --git a/include/linux/io.h b/include/linux/io.h
index 4cd8996cfea0..23ef87d311d5 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -123,4 +123,8 @@ static inline int arch_phys_wc_index(int handle)
#endif
#endif

+extern void *memremap_cache(resource_size_t offset, size_t size);
+extern void *memremap_wt(resource_size_t offset, size_t size);
+extern void memunmap(void *addr);
+
#endif /* _LINUX_IO_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5f2d..2f8aca09da52 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,7 +23,7 @@
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/resource_ext.h>
-#include <asm/io.h>
+#include <linux/io.h>


struct resource ioport_resource = {
@@ -528,6 +528,45 @@ int region_is_ram(resource_size_t start, unsigned long size)
return ret;
}

+#ifdef CONFIG_ARCH_HAS_MEMREMAP
+/*
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ */
+static bool memremap_valid(resource_size_t offset, size_t size)
+{
+ if (region_is_ram(offset, size) != 0) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %zu\n",
+ &offset, size);
+ return false;
+ }
+ return true;
+}
+
+void *memremap_cache(resource_size_t offset, size_t size)
+{
+ if (!memremap_valid(offset, size))
+ return NULL;
+ return (void __force *) ioremap_cache(offset, size);
+}
+EXPORT_SYMBOL(memremap_cache);
+
+void *memremap_wt(resource_size_t offset, size_t size)
+{
+ if (!memremap_valid(offset, size))
+ return NULL;
+ return (void __force *) ioremap_wt(offset, size);
+}
+EXPORT_SYMBOL(memremap_wt);
+
+void memunmap(void *addr)
+{
+ iounmap((void __iomem __force *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+#endif /* CONFIG_ARCH_HAS_MEMREMAP */
+
void __weak arch_remove_reservations(struct resource *avail)
{
}
diff --git a/lib/Kconfig b/lib/Kconfig
index 601965a948e8..bc7bc0278921 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -520,6 +520,9 @@ source "lib/fonts/Kconfig"
#

config ARCH_HAS_SG_CHAIN
- def_bool n
+ bool
+
+config ARCH_HAS_MEMREMAP
+ bool

endmenu


[PATCH v4 6/6] arch, x86: pmem api for ensuring durability of persistent memory updates

From: Ross Zwisler <ross.zwisler@linux.intel.com>

Based on an original patch by Ross Zwisler [1].

Writes to persistent memory have the potential to be posted to cpu
cache, cpu write buffers, and platform write buffers (memory controller)
before being committed to persistent media. Provide apis,
memcpy_to_pmem(), sync_pmem(), and memremap_pmem(), to write data to
pmem and assert that it is durable in PMEM (a persistent linear address
range). A '__pmem' attribute is added so sparse can track proper usage
of pointers to pmem.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html
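
A rough usage sketch of the new api (illustrative only; 'phys', 'size',
'buf', 'off' and 'len' are made-up names):

	void __pmem *p = memremap_pmem(phys, size);

	if (!p)
		return -ENOMEM;
	memcpy_to_pmem(p + off, buf, len);
	sync_pmem();
	/*
	 * Durability is only guaranteed when arch_has_sync_pmem() returns
	 * true; otherwise callers need a different consistency model.
	 */
	memunmap_pmem(p);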

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[djbw: various reworks]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/x86/Kconfig | 1
arch/x86/include/asm/cacheflush.h | 36 +++++++++++++
arch/x86/include/asm/io.h | 6 ++
drivers/block/pmem.c | 75 +++++++++++++++++++++++++--
include/linux/compiler.h | 2 +
include/linux/pmem.h | 102 +++++++++++++++++++++++++++++++++++++
lib/Kconfig | 3 +
7 files changed, 218 insertions(+), 7 deletions(-)
create mode 100644 include/linux/pmem.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f16caf7eac27..5dfb8f31ac48 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_MEMREMAP
+ select ARCH_HAS_PMEM_API
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b6f7457d12e4..4d896487382c 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
/* Caches aren't brain-dead on the intel. */
#include <asm-generic/cacheflush.h>
#include <asm/special_insns.h>
+#include <asm/uaccess.h>

/*
* The set_memory_* API can be used to change various attributes of a virtual
@@ -108,4 +109,39 @@ static inline int rodata_test(void)
}
#endif

+#ifdef ARCH_HAS_NOCACHE_UACCESS
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
+{
+ /*
+ * We are copying between two kernel buffers, if
+ * __copy_from_user_inatomic_nocache() returns an error (page
+ * fault) we would have already taken an unhandled fault before
+ * the BUG_ON. The BUG_ON is simply here to satisfy
+ * __must_check and allow reuse of the common non-temporal store
+ * implementation for memcpy_to_pmem().
+ */
+ BUG_ON(__copy_from_user_inatomic_nocache((void __force *) dst,
+ (void __user *) src, n));
+}
+
+static inline void arch_sync_pmem(void)
+{
+ wmb();
+ pcommit_sfence();
+}
+
+static inline bool __arch_has_sync_pmem(void)
+{
+ return boot_cpu_has(X86_FEATURE_PCOMMIT);
+}
+#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
+extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void arch_sync_pmem(void);
+
+static inline bool __arch_has_sync_pmem(void)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index e9d6691ec4c5..0a494ac22a8e 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -249,6 +249,12 @@ static inline void flush_write_buffers(void)
#endif
}

+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+ unsigned long size)
+{
+ return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
#endif /* __KERNEL__ */

extern void native_io_delay(void);
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index b00b97314b57..81090f61b8b1 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -23,23 +23,79 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
+#include <linux/pmem.h>
#include <linux/io.h>

#define PMEM_MINORS 16

+struct pmem_ops {
+ void __pmem *(*remap)(resource_size_t offset, unsigned long size);
+ void (*copy)(void __pmem *dst, const void *src, size_t size);
+ void (*sync)(void);
+};
+
struct pmem_device {
struct request_queue *pmem_queue;
struct gendisk *pmem_disk;

/* One contiguous memory region per device */
phys_addr_t phys_addr;
- void *virt_addr;
+ void __pmem *virt_addr;
size_t size;
+ struct pmem_ops ops;
};

static int pmem_major;
static atomic_t pmem_index;

+static void default_sync_pmem(void)
+{
+ wmb();
+}
+
+static void default_memcpy_to_pmem(void __pmem *dst, const void *src, size_t size)
+{
+ memcpy((void __force *) dst, src, size);
+}
+
+static void __pmem *default_memremap_pmem(resource_size_t offset, unsigned long size)
+{
+ return (void __pmem *)memremap_wt(offset, size);
+}
+
+static void pmem_ops_default_init(struct pmem_device *pmem)
+{
+ /*
+ * These defaults seek to offer decent performance and minimize
+ * the window between i/o completion and writes being durable on
+ * media. However, it is undefined / architecture specific
+ * whether default_memcpy_to_pmem + default_pmem_sync is
+ * sufficient for making data durable relative to i/o
+ * completion.
+ */
+ pmem->ops.remap = default_memremap_pmem;
+ pmem->ops.copy = default_memcpy_to_pmem;
+ pmem->ops.sync = default_sync_pmem;
+}
+
+static bool pmem_ops_init(struct pmem_device *pmem)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) &&
+ arch_has_sync_pmem()) {
+ /*
+ * This arch + cpu guarantees that bio_endio() == data
+ * durable on media.
+ */
+ pmem->ops.remap = memremap_pmem;
+ pmem->ops.copy = memcpy_to_pmem;
+ pmem->ops.sync = sync_pmem;
+ return true;
+ }
+
+ pmem_ops_default_init(pmem);
+ return false;
+}
+
static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
unsigned int len, unsigned int off, int rw,
sector_t sector)
@@ -48,11 +104,11 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
size_t pmem_off = sector << 9;

if (rw == READ) {
- memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+ memcpy_from_pmem(mem + off, pmem->virt_addr + pmem_off, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
- memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+ pmem->ops.copy(pmem->virt_addr + pmem_off, mem + off, len);
}

kunmap_atomic(mem);
@@ -83,6 +139,8 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
sector += bvec.bv_len >> 9;
}

+ if (rw)
+ pmem->ops.sync();
out:
bio_endio(bio, err);
}
@@ -107,7 +165,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
if (!pmem)
return -ENODEV;

- *kaddr = pmem->virt_addr + offset;
+ /* FIXME convert DAX to comprehend that this mapping has a lifetime */
+ *kaddr = (void __force *) pmem->virt_addr + offset;
*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;

return pmem->size - offset;
@@ -132,6 +191,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)

pmem->phys_addr = res->start;
pmem->size = resource_size(res);
+ if (!pmem_ops_init(pmem))
+ dev_warn(dev, "unable to guarantee persistence of writes\n");

err = -EINVAL;
if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
@@ -144,7 +205,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
* of the CPU caches in case of a crash.
*/
err = -ENOMEM;
- pmem->virt_addr = memremap_wt(pmem->phys_addr, pmem->size);
+ pmem->virt_addr = pmem->ops.remap(pmem->phys_addr, pmem->size);
if (!pmem->virt_addr)
goto out_release_region;

@@ -180,7 +241,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
out_free_queue:
blk_cleanup_queue(pmem->pmem_queue);
out_unmap:
- memunmap(pmem->virt_addr);
+ memunmap_pmem(pmem->virt_addr);
out_release_region:
release_mem_region(pmem->phys_addr, pmem->size);
out_free_dev:
@@ -194,7 +255,7 @@ static void pmem_free(struct pmem_device *pmem)
del_gendisk(pmem->pmem_disk);
put_disk(pmem->pmem_disk);
blk_cleanup_queue(pmem->pmem_queue);
- memunmap(pmem->virt_addr);
+ memunmap_pmem(pmem->virt_addr);
release_mem_region(pmem->phys_addr, pmem->size);
kfree(pmem);
}
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 05be2352fef8..26fc8bc77f85 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -21,6 +21,7 @@
# define __rcu __attribute__((noderef, address_space(4)))
#else
# define __rcu
+# define __pmem __attribute__((noderef, address_space(5)))
#endif
extern void __chk_user_ptr(const volatile void __user *);
extern void __chk_io_ptr(const volatile void __iomem *);
@@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *);
# define __cond_lock(x,c) (c)
# define __percpu
# define __rcu
+# define __pmem
#endif

/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 000000000000..0fad4ad714cc
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __PMEM_H__
+#define __PMEM_H__
+
+#include <linux/io.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Architectures that define ARCH_HAS_PMEM_API must provide
+ * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
+ * arch_sync_pmem(), and __arch_has_sync_pmem().
+ */
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * memremap_pmem - map physical persistent memory for pmem api
+ * @offset: physical address of persistent memory
+ * @size: size of the mapping
+ *
+ * Establish a mapping of the architecture specific memory type expected
+ * by memcpy_to_pmem() and sync_pmem(). For example, it may be
+ * the case that an uncacheable or writethrough mapping is sufficient,
+ * or a writeback mapping provided memcpy_to_pmem() and
+ * sync_pmem() arrange for the data to be written through the
+ * cache to persistent media.
+ */
+static inline void __pmem *memremap_pmem(resource_size_t offset, unsigned long size)
+{
+ return arch_memremap_pmem(offset, size);
+}
+
+/**
+ * memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Perform a memory copy that results in the destination of the copy
+ * being effectively evicted from, or never written to, the processor
+ * cache hierarchy after the copy completes. After memcpy_to_pmem()
+ * data may still reside in cpu or platform buffers, so this operation
+ * must be followed by a sync_pmem().
+ */
+static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
+{
+ arch_memcpy_to_pmem(dst, src, n);
+}
+
+/**
+ * sync_pmem - synchronize writes to persistent memory
+ *
+ * After a series of memcpy_to_pmem() operations this drains data from
+ * cpu write buffers and any platform (memory controller) buffers to
+ * ensure that written data is durable on persistent memory media.
+ */
+static inline void sync_pmem(void)
+{
+ arch_sync_pmem();
+}
+
+/**
+ * arch_has_sync_pmem - true if sync_pmem() ensures durability
+ *
+ * For a given cpu implementation within an architecture it is possible
+ * that sync_pmem() resolves to a nop. In the case this returns
+ * false, pmem api users are unable to ensure durability and may want to
+ * fall back to a different data consistency model, or otherwise notify
+ * the user.
+ */
+static inline bool arch_has_sync_pmem(void)
+{
+ return __arch_has_sync_pmem();
+}
+#else
+/* undefined symbols */
+extern void __pmem *memremap_pmem(resource_size_t offset, unsigned long size);
+extern void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void sync_pmem(void);
+extern bool arch_has_sync_pmem(void);
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+
+static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
+{
+ memcpy(dst, (void __force const *) src, size);
+}
+
+static inline void memunmap_pmem(void __pmem *addr)
+{
+ memunmap((void __force *) addr);
+}
+#endif /* __PMEM_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index bc7bc0278921..0d28cc560c6b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -525,4 +525,7 @@ config ARCH_HAS_SG_CHAIN
config ARCH_HAS_MEMREMAP
bool

+config ARCH_HAS_PMEM_API
+ bool
+
endmenu


[PATCH v4 4/6] devm: fix ioremap_cache() usage

Provide devm_ioremap_cache() and fix up devm_ioremap_resource() to
actually provide cacheable mappings. Today, even on archs that implement
ioremap_cache(), devm_ioremap_resource() silently falls back to an
uncached mapping when IORESOURCE_CACHEABLE is specified.
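
Illustrative use (not part of the patch; 'pdev' is a made-up platform
device):

	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);
	/*
	 * With this patch the mapping is cacheable when res->flags has
	 * IORESOURCE_CACHEABLE set, instead of silently falling back to
	 * an uncached mapping.
	 */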

Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
include/linux/io.h | 2 ++
lib/devres.c | 53 +++++++++++++++++++++++++---------------------------
2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/include/linux/io.h b/include/linux/io.h
index fb5a99800e77..4cd8996cfea0 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -71,6 +71,8 @@ static inline void devm_ioport_unmap(struct device *dev, void __iomem *addr)

void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
resource_size_t size);
+void __iomem *devm_ioremap_cache(struct device *dev, resource_size_t offset,
+ resource_size_t size);
void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset,
resource_size_t size);
void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset,
diff --git a/lib/devres.c b/lib/devres.c
index f4001d90d24d..c8e75cdaf816 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -14,6 +14,8 @@ static int devm_ioremap_match(struct device *dev, void *res, void *match_data)
return *(void **)res == match_data;
}

+typedef void __iomem *(*ioremap_fn)(resource_size_t offset, unsigned long size);
+
/**
* devm_ioremap - Managed ioremap()
* @dev: Generic device to remap IO address for
@@ -22,8 +24,9 @@ static int devm_ioremap_match(struct device *dev, void *res, void *match_data)
*
* Managed ioremap(). Map is automatically unmapped on driver detach.
*/
-void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
- resource_size_t size)
+static void __iomem *devm_ioremap_type(struct device *dev,
+ resource_size_t offset, resource_size_t size,
+ ioremap_fn ioremap_type)
{
void __iomem **ptr, *addr;

@@ -31,7 +34,7 @@ void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
if (!ptr)
return NULL;

- addr = ioremap(offset, size);
+ addr = ioremap_type(offset, size);
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
@@ -40,34 +43,25 @@ void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,

return addr;
}
+
+void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
+ resource_size_t size)
+{
+ return devm_ioremap_type(dev, offset, size, ioremap);
+}
EXPORT_SYMBOL(devm_ioremap);

-/**
- * devm_ioremap_nocache - Managed ioremap_nocache()
- * @dev: Generic device to remap IO address for
- * @offset: BUS offset to map
- * @size: Size of map
- *
- * Managed ioremap_nocache(). Map is automatically unmapped on driver
- * detach.
- */
+void __iomem *devm_ioremap_cache(struct device *dev, resource_size_t offset,
+ resource_size_t size)
+{
+ return devm_ioremap_type(dev, offset, size, ioremap_cache);
+}
+EXPORT_SYMBOL(devm_ioremap_cache);
+
void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset,
resource_size_t size)
{
- void __iomem **ptr, *addr;
-
- ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL);
- if (!ptr)
- return NULL;
-
- addr = ioremap_nocache(offset, size);
- if (addr) {
- *ptr = addr;
- devres_add(dev, ptr);
- } else
- devres_free(ptr);
-
- return addr;
+ return devm_ioremap_type(dev, offset, size, ioremap_nocache);
}
EXPORT_SYMBOL(devm_ioremap_nocache);

@@ -153,8 +147,11 @@ void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res)
return IOMEM_ERR_PTR(-EBUSY);
}

- /* FIXME: add devm_ioremap_cache support */
- dest_ptr = devm_ioremap(dev, res->start, size);
+ if (res->flags & IORESOURCE_CACHEABLE)
+ dest_ptr = devm_ioremap_cache(dev, res->start, size);
+ else
+ dest_ptr = devm_ioremap_nocache(dev, res->start, size);
+
if (!dest_ptr) {
dev_err(dev, "ioremap failed for resource %pR\n", res);
devm_release_mem_region(dev, res->start, size);


Re: [PATCH v2] powerpc: support sizes greater than an unsigned long

On Thu, 2015-06-11 at 19:10 +0300, Cristian Stoica wrote:
> On 06/11/2015 06:38 PM, Greg KH wrote:
> > On Thu, Jun 11, 2015 at 05:42:00PM +0300, Cristian Stoica wrote:
> >
> > Why?
> >
>
> This patch matches the input argument "size" of ioremap() with the
> output of request_mem_region() (which is
> resource_size_t).
> Since the latter is used as input to the former, the types should
> match (even though mapping more than 4G is not usually
> expected). There are a lot of such differences in the code and this
> is an attempt to reduce that.

Dropping the upper bits of the size harms the ability to detect error
scenarios where unmappably large -- but not power-of-two -- regions
are requested to be mapped.
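
A hypothetical illustration, on a 32-bit kernel with a 64-bit
resource_size_t (made-up numbers):

	resource_size_t size = 0x100001000ULL;	/* 4 GiB + 4 KiB, not a power of two */

	ioremap(res->start, size);	/* size parameter is unsigned long: upper bits
					   silently dropped, only 0x1000 bytes mapped */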

However, this patch doesn't fix that. It just postpones the loss of
the upper 32 bits until __ioremap_caller() calls get_vm_area_caller().

There's also no error checking at all for the size of ioremap() done
during early boot (!slab_is_available()).

Don't just blindly turn static analyzer reports into patches -- and
why didn't the analyzer complain about the call to
get_vm_area_caller() after this patch?

-Scott


[PATCH v2] selinux: reduce locking overhead in inode_free_security()

The inode_free_security() function just took the superblock's isec_lock
before checking and trying to remove the inode security struct from the
linked list. In many cases, the list was empty and so the lock taking
is wasteful as no useful work is done. On multi-socket systems with
a large number of CPUs, there can also be a fair amount of spinlock
contention on the isec_lock if many tasks are exiting at the same time.

This patch changes the code to check the state of the list first
before taking the lock and attempting to dequeue it. As this function
is called indirectly from __destroy_inode(), there can't be another
instance of inode_free_security() running on the same inode.

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
---
security/selinux/hooks.c | 15 ++++++++++++---
1 files changed, 12 insertions(+), 3 deletions(-)

v1->v2:
- Take out the second list_empty() test inside the lock.

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..e5cdad7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -254,10 +254,19 @@ static void inode_free_security(struct inode *inode)
struct inode_security_struct *isec = inode->i_security;
struct superblock_security_struct *sbsec = inode->i_sb->s_security;

- spin_lock(&sbsec->isec_lock);
- if (!list_empty(&isec->list))
+ /*
+ * As not all inode security structures are in a list, we check for
+ * empty list outside of the lock to make sure that we won't waste
+ * time taking a lock doing nothing. As inode_free_security() is
+ * being called indirectly from __destroy_inode(), there is no way
+ * there can be two or more concurrent calls. So doing the list_empty()
+ * test outside the lock should be safe.
+ */
+ if (!list_empty(&isec->list)) {
+ spin_lock(&sbsec->isec_lock);
list_del_init(&isec->list);
- spin_unlock(&sbsec->isec_lock);
+ spin_unlock(&sbsec->isec_lock);
+ }

/*
* The inode may still be referenced in a path walk and
--
1.7.1


Re: High performance HAProxy

Hi Willy,

Many thanks for your time and all the suggestions. This looks great.

I imagine we are going to try those and experiment for the next few days. I guess we'll hold off on the process binding in ssl_termination (2. below) for now, as we were experimenting on Debian 7 (which has a slightly older kernel) and want to keep things apples-to-apples. Or rather, if I may say so - we'll keep that for dessert :)

Will update on the results.

Thanks,
Eduard




2) you didn't specify any process binding in ssl_termination, so the
kernel wakes all processes with incoming connections, and a few of
them take some and the other ones go back to sleep. With a kernel
3.9 or later, you can multiply the "bind" lines and bind each of them
to a different process. The load will be much better distributed :

listen ssl_termination
bind 0.0.0.0:443 process 1 ssl crt /webapps/ssl/haproxy.new.crt ciphers AES-128-CBC:HIGH:!MD5:!aNULL:!eNULL:!NULL:!DH:!EDH:!AESGCM no-ssl3
bind 0.0.0.0:443 process 2 ssl crt /webapps/ssl/haproxy.new.crt ciphers AES-128-CBC:HIGH:!MD5:!aNULL:!eNULL:!NULL:!DH:!EDH:!AESGCM no-ssl3
...

Re: [RFC PATCH v2] arm DMA: Fix allocation from CMA for coherent DMA

On Thu, Jun 11, 2015 at 4:26 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Wed, Jun 10, 2015 at 09:34:43PM +0200, Lorenzo Nava wrote:
>> On Wed, Jun 10, 2015 at 6:28 PM, Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>> > On Wed, Jun 03, 2015 at 07:15:45PM +0200, Lorenzo Nava wrote:
>> > > This patch allows the use of CMA for DMA coherent memory allocation.
>> > > At the moment if the input parameter "is_coherent" is set to true
>> > > the allocation is not made using the CMA, which I think is not the
>> > > desired behaviour.
>> > >
>> > > Signed-off-by: Lorenzo Nava <lorenx4@xxxxxxxx>
> [...]
>> > So while you allow __alloc_from_contiguous() to be called when
>> > is_coherent, the memory returned is still non-cacheable. The reason is
>> > that the "prot" argument passed to __dma_alloc() in
>> > arm_coherent_dma_alloc() is pgprot_dmacoherent(PAGE_KERNEL) which means
>> > Normal NonCacheable memory. The mmap seems to create a cacheable mapping
>> > as vma->vm_page_prot is not passed through __get_dma_pgprot().
> [...]
>> Well the final scope of this patch is just to fix what in my opinion
>> is an incorrect behaviour: the lack of use of CMA when the flag
>> "is_coherent" is set.
>
> But you still have to fix it properly: "is_coherent" means cacheable
> memory which you don't get with your patch.
>
>> Of course it still exists the problem of modify the attribute to make
>> the memory cacheable, but it is something I would like to do in a
>> second step (the patch you posted is of course a good starting point).
>
> So between the first and the second step, you basically break
> dma_alloc_coherent() by moving the allocation from
> __alloc_simple_buffer() (returning cacheable memory) to
> __alloc_from_contiguous() which changes the memory attributes to
> whatever __get_dma_pgprot() returned (currently Normal Non-cacheable).
>

Maybe I'm missing something.
What I see is that dma_alloc_coherent() calls dma_alloc_attrs() with
attrs set to NULL.
Depending on the DMA coherency settings, either arm_coherent_dma_alloc()
or arm_dma_alloc() is called. Both functions behave similarly and set
prot according to __get_dma_pgprot(), which uses the
pgprot_dmacoherent() attributes in both cases, marking the memory
bufferable and _non_ cacheable. So the memory gets the same attributes
even if __alloc_simple_buffer() is used.
What I see is that only by using the dma_alloc_writecombine() function
can you get cacheable memory attributes.

>> I think that the current implementation maps memory keeping non
>> cacheable attributes enable, because the 'attrs' parameter passed to
>> arm_dma_mmap() has no WRITE_COMBINE attribute set (according to
>> dma_mmap_coherent() in include/asm-generic/dma-mapping-common.h).
>
> At least on ARMv7, WRITE_COMBINE and Normal Non-cacheable are the same.

Yes, but the function __get_dma_pgprot() uses different flags
depending on attribute DMA_ATTR_WRITE_COMBINE: if defined the memory
is marked as cacheable.

>
>> I also notice this patch that is pending "[PATCH v3]
>> arm/mm/dma-mapping.c: Add arm_coherent_dma_mmap": it modifies the
>> mapping of memory for coherent DMA. I want to understand if the merge
>> of this patch requires any other modification to guarantee that
>> coherent memory is allocated with cacheable attributes.
>
> I think this patch will go in, it is already in linux-next.
>

Ok, thanks. Anyway I think it shouldn't affect the allocation stuff.

Lorenzo

> --
> Catalin

Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()

On Thu, Jun 11, 2015 at 08:33:41AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 11, 2015 at 02:46:47PM +0200, Peter Zijlstra wrote:
> > Introduce raw_write_seqcount_barrier(), a new construct that can be
> > used to provide write barrier semantics in seqcount read loops instead
> > of the usual consistency guarantee.
> >
> > Cc: Al Viro <viro@ZenIV.linux.org.uk>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> > Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
> > Suggested-by: Oleg Nesterov <oleg@redhat.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > include/linux/seqlock.h | 42 ++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 42 insertions(+)
> >
> > --- a/include/linux/seqlock.h
> > +++ b/include/linux/seqlock.h
> > @@ -233,6 +233,48 @@ static inline void raw_write_seqcount_en
> > s->sequence++;
> > }
> >
> > +/**
> > + * raw_write_seqcount_barrier - do a seq write barrier
> > + * @s: pointer to seqcount_t
> > + *
> > + * This can be used to provide an ordering guarantee instead of the
> > + * usual consistency guarantee. It is one wmb cheaper, because we can
> > + * collapse the two back-to-back wmb()s.
> > + *
> > + * seqcount_t seq;
> > + * bool X = true, Y = false;
> > + *
> > + * void read(void)
> > + * {
> > + * bool x, y;
> > + *
> > + * do {
> > + * int s = read_seqcount_begin(&seq);
> > + *
> > + * x = X; y = Y;
> > + *
> > + * } while (read_seqcount_retry(&seq, s));
> > + *
> > + * BUG_ON(!x && !y);
> > + * }
> > + *
> > + * void write(void)
> > + * {
> > + * Y = true;
> > + *
> > + * write_seqcount_begin(seq);
> > + * write_seqcount_end(seq);
> > + *
> > + * X = false;
> > + * }
>
> So when using this, write() would instead look like this?
>
> void write(void)
> {
> 	Y = true;
> 	raw_write_seqcount_barrier(seq);
> 	X = false;
> }
>
> I suggest calling this out explicitly. Agreed, it should be obvious,
> but some poor sot is going to be reading this at 3AM local time after
> a couple days of no sleep, in which case obvious might not be so obvious.
>
> I also would suggest READ_ONCE() and WRITE_ONCE() to keep the compiler
> trickiness down to a dull roar. Understood, it is hard to make anything
> bad happen in this case, but small changes could result in badness.
>
> > + */
> > +static inline void raw_write_seqcount_barrier(seqcount_t *s)
> > +{
> > + s->sequence++;
> > + smp_wmb();
> > + s->sequence++;
> > +}
> > +
> > /*
> > * raw_write_seqcount_latch - redirect readers to even/odd copy
> > * @s: pointer to seqcount_t
>
> Looks good otherwise.
>
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Color me slow and stupid. Maybe due to reviewing a patch too early in
the morning, who knows?

There is nothing above that prevents the compiler and the CPU from
reordering the assignments to X and Y with the increments of s->sequence.
One fix would be as follows:

static inline void raw_write_seqcount_barrier(seqcount_t *s)
{
	smp_wmb();
	s->sequence++;
	smp_wmb();
	s->sequence++;
	smp_wmb();
}

Of course, this assumes that the accesses surrounding the call to
raw_write_seqcount_barrier() are writes. If they can be a reads,
the two added smp_wmb() calls need to be full barriers.

Thanx, Paul


Re: [rtc-linux] RE: [PATCH V3 1/4] mfd: da9062: DA9062 MFD core driver

Hi,

On 11/06/2015 at 09:27:45 +0000, Opensource [Steve Twiss] wrote :
>
> Thanks for replying so quickly.
>
> > I'm still not particularly happy with this. Can yo speak to your H/W
> > guys and get them to change their scripts to output sensible header
> > files?
>
> Ah. Ok.
>
> For our side, the generated headers might not just be used for Linux. I've
> just discussed this with my colleagues and they will need it to remain.
> So I guess internally we will keep the headers like this, but as it enters
> my submission process I can change it for the Linux community.
>
> > To be honest, it's probably not a blocker for acceptance, but if someone
> > writes a patch next week to change all of the (0x01 << X) lines to
> > start using the BIT() macro, I will accept it. Better to influenced
> > your guys so you are not overly inconvenienced.
>
> Yep: I will change this BIT() macro for the submissions in future.
> Depending on the next step, I will send a patch to this or update the submission
> if there are further comments on this patch set.
>

Maybe a good solution would be to define BIT() at the top of your
generated header so that you simply have to remove it or replace it with
the proper include before mainline submission.
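
Something like this at the top of the generated header would do (just a
sketch; the mainline submission would then pick up the kernel's BIT()
from the regular headers instead):

	#ifndef BIT
	#define BIT(n)	(1UL << (n))
	#endif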

> > FWIW, when upstreaming code, the excuse "someone else wrote it", has
> > never been a good one to use on the lists. Believe me, I've
> > tried. ;)
>
> heh okay :)
>
> Regards,
> Steve
>

--
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

Re: [PATCH v2] MAINTAINERS: ARM64: EXYNOS: Extend entry for ARM64 DTS

Hi Krzysztof,

On Sat, Jun 6, 2015 at 3:02 AM, Krzysztof Kozlowski
<k.kozlowski@samsung.com> wrote:
> Extend the Exynos entry to ARM64 device tree sources.
>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Russell King <linux@arm.linux.org.uk>
> Cc: Kukjin Kim <kgene@kernel.org>
> Cc: Kevin Hilman <khilman@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Olof Johansson <olof@lixom.net>
> Cc: linux-samsung-soc@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
> Reviewed-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>

Applied to arm-soc's next/soc branch.

Note I found this by chance. It almost fell through the cracks
because arm@kernel.org (for arm-soc maintainers) wasn't on the to/cc
list.

Kevin

Re: [PATCH 07/12] x86/virt/guest/xen: Remove use of pgd_list from the Xen guest code

On 06/12, Ingo Molnar wrote:
>
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > So I think the only issue is that ->mm can become NULL when the thread group
> > leader dies - a non-NULL mm should always be shared among all threads.
>
> Indeed, we do that in exit_mm().

Yes,

> So we could add tsk->mm_leader or so,

No, no, please do not. Just do something like

	for_each_process(p) {

		for_each_thread(p, t) {
			if (t->mm) {
				do_something(t->mm);
				break;
			}
		}
	}

But either way I don't understand what protects this ->mm. Perhaps this needs
find_lock_task_mm().
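
Something like this, just as a sketch, if it applies here:

	t = find_lock_task_mm(p);
	if (t) {
		do_something(t->mm);	/* ->mm can't be cleared under task_lock(t) */
		task_unlock(t);
	}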

Oleg.


RE: [PATCH 1/1] perf,tools: add time out to force stop endless mmap processing

>
> On 6/12/15 12:19 PM, Liang, Kan wrote:
> >>> [perf]$ sudo ./perf record -e instructions:pp --pid 14560 Reading
> >>> /proc/14560/maps cost 13.12690599 s ^C[ perf record: Woken up 1
> >>> times to write data ] [ perf record: Captured and wrote 0.108 MB
> >>> perf.data
> >>> (2783 samples) ]
> >>
> >> so perf was able to read the proc file?
> >
> > Yes, perf always can read proc file. The problem is that the proc file
> > is huge and keep growing faster than proc reader.
> > So perf top do loop in perf_event__synthesize_mmap_events until the
> > test case exit.
>
> I'm confused. How are you getting the above time to read /proc maps if it
> never finishes?

I just tried to simplify the issue for perf record. So you may have noticed
that I only read one thread. There are several threads in the system.
Also, I run the perf record test when starting the test case, so the proc
file is not that big at that point.
For perf top, it monitors the whole system. So it never finishes.

>
> for this test case how does perf-record compare between proc and
> task_diag? You can use my command for both. It defaults to using
> task_diag and then you can add --no-task_diag to have it read /proc. And
> as mentioned before it is only setup for 'perf record -a' case. So
>
> launch your test program
> perf record -a -- usleep 1
> perf record -a --no-task_diag -- usleep 1
>

Here are the test results.
Please note that I only get "synthesized threads took..." after the test case
exits. It means both ways have the same issue.

[perf]$ sudo ./perf record -a -- usleep 1
synthesized threads took 278.780762 sec
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.617 MB perf.data (7974 samples) ]

[perf]$ sudo ./perf record -a --no-task_diag -- usleep 1
synthesized threads took 315.612403 sec
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.443 MB perf.data (2754 samples) ]

Thanks,
Kan


Re: [PATCH] staging: comedi: replace ENOSYS by proper error codes

On Fri, Jun 12, 2015 at 10:20:38PM +0200, julien.dehee@gmail.com wrote:
> From: Julien Dehee <julien.dehee@gmail.com>
>
> comedi/comedi_fops.c
> use ENODEV following open manual
>
> comedi/drivers.c
> use ENOTTY following ioctl manual
>
> drivers/serial2002.c
> use ENOTTY following ioctl manual

What do you mean by "ioctl manual"?

And shouldn't this be 3 different patches as they do different things to
different drivers?

thanks,

greg k-h

Re: [PATCH v7 06/10] trace: Add lock-free tracing_map

On Mon, 8 Jun 2015 16:32:05 -0500
Tom Zanussi <tom.zanussi@linux.intel.com> wrote:


> +/**
> + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum
> + * @elt: The tracing_map_elt

Eggs, lettuce, and tomato! Yummmm!



> +static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
> + const struct tracing_map_sort_entry **b)
> +{
> + int ret = 0;
> +
> + if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
> + ret = 1;
> +
> + return ret;
> +}
> +
> +static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
> + const struct tracing_map_sort_entry **b)
> +{
> + const struct tracing_map_elt *elt_a, *elt_b;
> + struct tracing_map_sort_key *sort_key;
> + struct tracing_map_field *field;
> + tracing_map_cmp_fn_t cmp_fn;
> + void *val_a, *val_b;
> + int ret = 0;
> +
> + elt_a = (*a)->elt;
> + elt_b = (*b)->elt;
> +
> + sort_key = &elt_a->map->sort_key;
> +
> + field = &elt_a->fields[sort_key->field_idx];
> + cmp_fn = field->cmp_fn;
> +
> + val_a = &elt_a->fields[sort_key->field_idx].sum;
> + val_b = &elt_b->fields[sort_key->field_idx].sum;
> +
> + ret = cmp_fn(val_a, val_b);
> + if (sort_key->descending)
> + ret = -ret;
> +
> + return ret;
> +}
> +
> +static int cmp_entries_key(const struct tracing_map_sort_entry **a,
> + const struct tracing_map_sort_entry **b)
> +{
> + const struct tracing_map_elt *elt_a, *elt_b;
> + struct tracing_map_sort_key *sort_key;
> + struct tracing_map_field *field;
> + tracing_map_cmp_fn_t cmp_fn;
> + void *val_a, *val_b;
> + int ret = 0;
> +
> + elt_a = (*a)->elt;
> + elt_b = (*b)->elt;
> +
> + sort_key = &elt_a->map->sort_key;
> +
> + field = &elt_a->fields[sort_key->field_idx];
> +
> + cmp_fn = field->cmp_fn;
> +
> + val_a = elt_a->key + field->offset;
> + val_b = elt_b->key + field->offset;
> +
> + ret = cmp_fn(val_a, val_b);
> + if (sort_key->descending)
> + ret = -ret;
> +
> + return ret;
> +}
> +
> +static void destroy_sort_entry(struct tracing_map_sort_entry *entry)
> +{
> + if (!entry)
> + return;
> +
> + if (entry->elt_copied)
> + tracing_map_elt_free(entry->elt);
> +
> + kfree(entry);
> +}
> +
> +/**
> + * tracing_map_destroy_entries - Destroy a tracing_map_sort_entries() array
> + * @entries: The entries to destroy
> + * @n_entries: The number of entries in the array
> + *
> + * Destroy the elements returned by a tracing_map_sort_entries() call.
> + */
> +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
> + unsigned int n_entries)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < n_entries; i++)
> + destroy_sort_entry(entries[i]);
> +}
> +
> +static struct tracing_map_sort_entry *
> +create_sort_entry(void *key, struct tracing_map_elt *elt)
> +{
> + struct tracing_map_sort_entry *sort_entry;
> +
> + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
> + if (!sort_entry)
> + return NULL;
> +
> + sort_entry->key = key;
> + sort_entry->elt = elt;
> +
> + return sort_entry;
> +}
> +
> +static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
> +{
> + struct tracing_map_elt *dup_elt;
> + unsigned int i;
> +
> + dup_elt = tracing_map_elt_alloc(elt->map);
> + if (!dup_elt)
> + return NULL;
> +
> + if (elt->map->ops && elt->map->ops->elt_copy)
> + elt->map->ops->elt_copy(dup_elt, elt);
> +
> + dup_elt->private_data = elt->private_data;
> + memcpy(dup_elt->key, elt->key, elt->map->key_size);
> +
> + for (i = 0; i < elt->map->n_fields; i++) {
> + atomic64_set(&dup_elt->fields[i].sum,
> + atomic64_read(&elt->fields[i].sum));
> + dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
> + }
> +
> + return dup_elt;
> +}
> +
> +static int merge_dup(struct tracing_map_sort_entry **sort_entries,
> + unsigned int target, unsigned int dup)
> +{
> + struct tracing_map_elt *target_elt, *elt;
> + bool first_dup = (target - dup) == 1;
> + int i;
> +
> + if (first_dup) {
> + elt = sort_entries[target]->elt;
> + target_elt = copy_elt(elt);
> + if (!target_elt)
> + return -ENOMEM;
> + sort_entries[target]->elt = target_elt;
> + sort_entries[target]->elt_copied = true;
> + } else
> + target_elt = sort_entries[target]->elt;
> +
> + elt = sort_entries[dup]->elt;
> +
> + for (i = 0; i < elt->map->n_fields; i++)
> + atomic64_add(atomic64_read(&elt->fields[i].sum),
> + &target_elt->fields[i].sum);
> +
> + sort_entries[dup]->dup = true;
> +
> + return 0;
> +}
> +
> +static int merge_dups(struct tracing_map_sort_entry **sort_entries,
> + int n_entries, unsigned int key_size)
> +{
> + unsigned int dups = 0, total_dups = 0;
> + int err, i, j;
> + void *key;
> +
> + if (n_entries < 2)
> + return total_dups;
> +
> + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
> + (int (*)(const void *, const void *))cmp_entries_dup, NULL);

Sort.

> +
> + key = sort_entries[0]->key;
> + for (i = 1; i < n_entries; i++) {
> + if (!memcmp(sort_entries[i]->key, key, key_size)) {
> + dups++; total_dups++;
> + err = merge_dup(sort_entries, i - dups, i);
> + if (err)
> + return err;
> + continue;
> + }
> + key = sort_entries[i]->key;
> + dups = 0;
> + }
> +
> + if (!total_dups)
> + return total_dups;
> +
> + for (i = 0, j = 0; i < n_entries; i++) {
> + if (!sort_entries[i]->dup) {
> + sort_entries[j] = sort_entries[i];
> + if (j++ != i)
> + sort_entries[i] = NULL;
> + } else {
> + destroy_sort_entry(sort_entries[i]);
> + sort_entries[i] = NULL;
> + }
> + }
> +
> + return total_dups;
> +}
> +
> +static bool is_key(struct tracing_map *map, unsigned int field_idx)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < map->n_keys; i++)
> + if (map->key_idx[i] == field_idx)
> + return true;
> + return false;
> +}
> +
> +static void sort_secondary(struct tracing_map *map,
> + const struct tracing_map_sort_entry **entries,
> + unsigned int n_entries,
> + struct tracing_map_sort_key *primary_key,
> + struct tracing_map_sort_key *secondary_key)
> +{
> + int (*primary_fn)(const struct tracing_map_sort_entry **,
> + const struct tracing_map_sort_entry **);
> + int (*secondary_fn)(const struct tracing_map_sort_entry **,
> + const struct tracing_map_sort_entry **);
> + unsigned i, start = 0, n_sub = 1;
> +
> + if (is_key(map, primary_key->field_idx))
> + primary_fn = cmp_entries_key;
> + else
> + primary_fn = cmp_entries_sum;
> +
> + if (is_key(map, secondary_key->field_idx))
> + secondary_fn = cmp_entries_key;
> + else
> + secondary_fn = cmp_entries_sum;
> +
> + for (i = 0; i < n_entries - 1; i++) {
> + const struct tracing_map_sort_entry **a = &entries[i];
> + const struct tracing_map_sort_entry **b = &entries[i + 1];
> +
> + if (primary_fn(a, b) == 0) {
> + n_sub++;
> + if (i < n_entries - 2)
> + continue;
> + }
> +
> + if (n_sub < 2) {
> + start = i + 1;
> + n_sub = 1;
> + continue;
> + }
> +
> + set_sort_key(map, secondary_key);
> + sort(&entries[start], n_sub,
> + sizeof(struct tracing_map_sort_entry *),
> + (int (*)(const void *, const void *))secondary_fn, NULL);
> + set_sort_key(map, primary_key);
> +
> + start = i + 1;
> + n_sub = 1;
> + }
> +}
> +
> +/**
> + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
> + * @map: The tracing_map
> + * @sort_key: The sort key to use for sorting
> + * @sort_entries: outval: pointer to allocated and sorted array of entries
> + *
> + * tracing_map_sort_entries() sorts the current set of entries in the
> + * map and returns the list of tracing_map_sort_entries containing
> + * them to the client in the sort_entries param.
> + *
> + * The sort_key has only two fields: idx and descending. 'idx' refers
> + * to the index of the field added via tracing_map_add_sum_field() or
> + * tracing_map_add_key_field() when the tracing_map was initialized.
> + * 'descending' is a flag that if set reverses the sort order, which
> + * by default is ascending.
> + *
> + * The client should not hold on to the returned array but use it
> + * and call tracing_map_destroy_sort_entries() when done.
> + *
> + * Return: the number of sort_entries in the tracing_map_sort_entry
> + * array, negative on err
> + */
> +int tracing_map_sort_entries(struct tracing_map *map,
> + struct tracing_map_sort_key *sort_keys,
> + unsigned int n_sort_keys,
> + struct tracing_map_sort_entry ***sort_entries)
> +{
> + int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
> + const struct tracing_map_sort_entry **);
> + struct tracing_map_sort_entry *sort_entry, **entries;
> + int i, n_entries, ret;
> +
> + entries = kcalloc(map->max_elts, sizeof(sort_entry), GFP_KERNEL);
> + if (!entries)
> + return -ENOMEM;
> +
> + for (i = 0, n_entries = 0; i < map->map_size; i++) {
> + if (!map->map[i].key || !map->map[i].val)
> + continue;
> +
> + entries[n_entries] = create_sort_entry(map->map[i].val->key,
> + map->map[i].val);
> + if (!entries[n_entries++]) {
> + ret = -ENOMEM;
> + goto free;
> + }
> + }
> +
> + if (n_entries == 0) {
> + ret = 0;
> + goto free;
> + }
> +
> + if (n_entries == 1) {
> + *sort_entries = entries;
> + return 1;
> + }
> +
> + ret = merge_dups(entries, n_entries, map->key_size);

So this sorts.

> + if (ret < 0)
> + goto free;
> + n_entries -= ret;
> +
> + if (is_key(map, sort_keys[0].field_idx))
> + cmp_entries_fn = cmp_entries_key;
> + else
> + cmp_entries_fn = cmp_entries_sum;
> +
> + set_sort_key(map, &sort_keys[0]);
> +
> + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *),
> + (int (*)(const void *, const void *))cmp_entries_fn, NULL);

Then this sorts.

Why the double sort? Can't you just sort once, and then remove the dups?
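
For illustration only, the single-sort variant being hinted at might look roughly
like this; it is a sketch built from the patch's own helpers (and a new loop
counter k), not code from the series, and error handling for copy_elt() is
omitted:

	sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *),
	     (int (*)(const void *, const void *))cmp_entries_dup, NULL);

	/* Fold runs of equal keys into the first entry of each run. */
	for (i = 1, j = 0; i < n_entries; i++) {
		if (memcmp(entries[i]->key, entries[j]->key, key_size)) {
			entries[++j] = entries[i];	/* new key starts a run */
			continue;
		}
		if (!entries[j]->elt_copied) {	/* first dup: don't touch the live elt */
			entries[j]->elt = copy_elt(entries[j]->elt);
			entries[j]->elt_copied = true;
		}
		for (k = 0; k < map->n_fields; k++)
			atomic64_add(atomic64_read(&entries[i]->elt->fields[k].sum),
				     &entries[j]->elt->fields[k].sum);
		destroy_sort_entry(entries[i]);
	}
	n_entries = j + 1;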

-- Steve

> +
> + if (n_sort_keys > 1)
> + sort_secondary(map,
> + (const struct tracing_map_sort_entry **)entries,
> + n_entries,
> + &sort_keys[0],
> + &sort_keys[1]);
> +
> + *sort_entries = entries;
> +
> + return n_entries;
> + free:
> + tracing_map_destroy_sort_entries(entries, n_entries);
> +
> + return ret;
> +}

Re: [PATCH v7 06/10] trace: Add lock-free tracing_map

On Mon, 8 Jun 2015 16:32:05 -0500
Tom Zanussi <tom.zanussi@linux.intel.com> wrote:

> +/**
> + * tracing_map_init - Allocate and clear a map's tracing_map_elts
> + * @map: The tracing_map to initialize
> + *
> + * Creates and sets up a map to contain a max_size number of entries
> + * equal to a size of 2 ** map_bits. Before using, the map fields
> + * should be added to the map with tracing_map_add_key_field() and
> + * tracing_map_add_key_field(). tracing_map_init() should then be

Hmm, is it supposed to be added with tracing_map_add_key_field() twice?

-- Steve

> + * called to allocate the array of tracing_map_elts, in order to avoid
> + * allocating anything in the map insertion path. The user-specified
> + * map_size reflect the max number of entries requested by the user -
> + * internally we double that in order to keep the table sparse and
> + * keep collisions manageable.
> + *
> + * See tracing_map.h for a description of tracing_map_ops.
> + *
> + * Return: the tracing_map * if successful, ERR_PTR if not.
> + */

Re: [PATCH 1/1] perf,tools: add time out to force stop endless mmap processing

On 6/12/15 2:39 PM, Liang, Kan wrote:
> Here are the test results.
> Please note that I get "synthesized threads took..." only after the test case exits.
> It means both ways have the same issue.

Got it. So what you really mean is that when launching perf on an already
running process, perf never finishes initializing. There are several types
of problems like this. For example, on a sparc system with 1024 cpus, if I
launch perf (top or record) after starting a kernel build with make -j
1024, the build finishes before perf starts collecting samples; i.e., perf
never finishes walking /proc until the build is complete. task_diag does
not solve that problem, and in general the procps tools (ps or top, for
example) can't handle it either.

For your test case, what happens if you run:
perf record -- test-app

Is perf overloaded with mmap samples? Does it keep up, or do you have to
increase the mmap size (-m arg)?
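
A larger ring buffer can be requested with something like the following,
where "test-app" stands in for the actual workload:

	perf record -m 1024 -e instructions:pp -- ./test-app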

>
> [perf]$ sudo ./perf record -a -- usleep 1
> synthesized threads took 278.780762 sec
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.617 MB perf.data (7974 samples) ]
>
> [perf]$ sudo ./perf record -a --no-task_diag -- usleep 1
> synthesized threads took 315.612403 sec
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.443 MB perf.data (2754 samples) ]
>

ok.

Thanks,
David


Re: [PATCH 07/12] x86/virt/guest/xen: Remove use of pgd_list from the Xen guest code

On 06/12, Oleg Nesterov wrote:
>
> On 06/12, Ingo Molnar wrote:
> >
> > * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >
> > > So I think the only issue is that ->mm can become NULL when the thread group
> > > leader dies - a non-NULL mm should always be shared among all threads.
> >
> > Indeed, we do that in exit_mm().
>
> Yes,
>
> > So we could add tsk->mm_leader or so,
>
> No, no, please do not. Just do something like
>
> for_each_process(p) {
>
> for_each_thread(p, t) {
> if (t->mm) {
> do_something(t->mm);
> break;
> }
> }
> }
>
> But either way I don't understand what protects this ->mm. Perhaps this needs
> find_lock_task_mm().

And I don't understand this code, so this probably doesn't matter, but:

unpin_all() is probably fine, but xen_mm_pin_all() can race with fork()
and miss the new child. Is it OK?

Oleg.


Re: [PATCH non-pretimeout 3/7] ARM64: add SBSA Generic Watchdog device node in amd-seattle-soc.dtsi

On 06/10/2015 12:47 PM, fu.wei@linaro.org wrote:
> + reg = <0x0 0xe0bb0000 0 0x10000>,
> + <0x0 0xe0bc0000 0 0x10000>;

I think the sizes are wrong. They should be 0x1000 instead of 0x10000.
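
In other words, the suggestion is one 4K page per register frame, roughly
as follows (untested, just spelling out the proposed fix):

	reg = <0x0 0xe0bb0000 0 0x1000>,
	      <0x0 0xe0bc0000 0 0x1000>;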

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

Re: [PATCH] staging: comedi: replace ENOSYS by proper error codes

On Fri, 12 Jun 2015 13:43:27 -0700
Greg KH <gregkh@linuxfoundation.org> wrote:

> On Fri, Jun 12, 2015 at 10:20:38PM +0200, julien.dehee@gmail.com wrote:
> > From: Julien Dehee <julien.dehee@gmail.com>
> >
> > comedi/comedi_fops.c
> > use ENODEV following open manual

That should probably be ENXIO by a strict reading, but Linux has always
used ENODEV 8)

> >
> > comedi/drivers.c
> > use ENOTTY following ioctl manual
> >
> > drivers/serial2002.c
> > use ENOTTY following ioctl manual
>
> What do you mean by "ioctl manual"?

man 2 ioctl

Unknown ioctls on a device should error with ENOTTY. It's one of
those crazy pieces of Unix history.
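
As a generic illustration of that convention (not comedi code; FOO_RESET and
foo_do_reset() are made-up names):

	static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	{
		switch (cmd) {
		case FOO_RESET:
			return foo_do_reset(file->private_data);
		default:
			return -ENOTTY;	/* unknown ioctl for this device */
		}
	}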


Re: [PATCH 17/20] [SCSI] mpt3sas: Use alloc_ordered_workqueue() API instead of create_singlethread_workqueue() API

On 06/12/2015 05:42 AM, Sreekanth Reddy wrote:
....
> +#if defined(alloc_ordered_workqueue)
> + ioc->firmware_event_thread = alloc_ordered_workqueue(
> + ioc->firmware_event_name, WQ_MEM_RECLAIM);
> +#else
> + ioc->firmware_event_thread = create_singlethread_workqueue(
> ioc->firmware_event_name);
> +#endif

Hi Sreekanth,

I think the upstream version of this code can safely assume
alloc_ordered_workqueue is defined, no?
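If so, the #if/#else could collapse to the direct call, roughly (sketch only,
error handling unchanged from the existing code):

	ioc->firmware_event_thread = alloc_ordered_workqueue(
	    ioc->firmware_event_name, WQ_MEM_RECLAIM);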

Regards,

-- Joe