[PATCH 7/9] powerpc/mm: Wire up ioremap_cache
The default implementation of ioremap_cache() is aliased to ioremap(). On powerpc ioremap() creates cache-inhibited mappings by default which is almost certainly not what you wanted. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/io.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 5ed292431b5b..839eb031857f 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -757,6 +757,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size)ioremap((addr), (size)) #define ioremap_uc(addr, size) ioremap((addr), (size)) +#define ioremap_cache(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) extern void iounmap(volatile void __iomem *addr); -- 2.9.3
[PATCH 6/9] powerpc, mm: Enable ZONE_DEVICE on powerpc
Flip the switch. Running around and screaming "IT'S ALIVE" is optional, but recommended. Signed-off-by: Oliver O'Halloran --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 43d000e44424..d696af58f97f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -724,7 +724,7 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on X86_64 #arch_add_memory() comprehends device memory + depends on (X86_64 || PPC_BOOK3S_64) #arch_add_memory() comprehends device memory help Device memory hotplug support allows for establishing pmem, -- 2.9.3
[PATCH 5/9] powerpc/vmemmap: Add altmap support
Adds support to powerpc for the altmap feature of ZONE_DEVICE memory. An altmap is a driver provided region that is used to provide the backing storage for the struct pages of ZONE_DEVICE memory. In situations where large amount of ZONE_DEVICE memory is being added to the system the altmap reduces pressure on main system memory by allowing the mm/ metadata to be stored on the device itself rather in main memory. Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/init_64.c | 20 +++- arch/powerpc/mm/mem.c | 16 +--- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index f8124edb6ffa..225fbb8034e6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -171,13 +172,17 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); for (; start < end; start += page_size) { + struct vmem_altmap *altmap; void *p; int rc; if (vmemmap_populated(start, page_size)) continue; - p = vmemmap_alloc_block(page_size, node); + /* altmap lookups only work at section boundaries */ + altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start)); + + p = __vmemmap_alloc_block_buf(page_size, node, altmap); if (!p) return -ENOMEM; @@ -241,9 +246,10 @@ void __ref vmemmap_free(unsigned long start, unsigned long end) pr_debug("vmemmap_free %lx...%lx\n", start, end); for (; start < end; start += page_size) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); - unsigned int nr_pages; - unsigned long addr; + unsigned long nr_pages, addr; + struct vmem_altmap *altmap; + struct page *section_base; + struct page *page; /* * the section has already be marked as invalid, so @@ -258,9 +264,13 @@ void __ref vmemmap_free(unsigned long start, unsigned long end) continue; page = pfn_to_page(addr >> PAGE_SHIFT); + section_base = 
pfn_to_page(vmemmap_section_start(start)); nr_pages = 1 << page_order; - if (PageReserved(page)) { + altmap = to_vmem_altmap((unsigned long) section_base); + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + } else if (PageReserved(page)) { /* allocated from bootmem */ if (page_size < PAGE_SIZE) { /* diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3bbba178b464..6f7b64eaa9d8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -176,7 +177,8 @@ int arch_remove_memory(u64 start, u64 size, enum memory_type type) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; + struct vmem_altmap *altmap; + struct page *page; int ret; /* @@ -193,8 +195,16 @@ int arch_remove_memory(u64 start, u64 size, enum memory_type type) return -EINVAL; } - zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages); + /* +* If we have an altmap then we need to skip over any reserved PFNs +* when querying the zone. +*/ + page = pfn_to_page(start_pfn); + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + + ret = __remove_pages(page_zone(page), start_pfn, nr_pages); if (ret) return ret; -- 2.9.3
[PATCH 4/9] powerpc/mm: Reshuffle vmemmap_free()
Removes an indentation level and shuffles some code around to make the following patch cleaner. No functional changes. Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/init_64.c | 47 +-- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ec84b31c6c86..f8124edb6ffa 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -234,12 +234,15 @@ static unsigned long vmemmap_list_free(unsigned long start) void __ref vmemmap_free(unsigned long start, unsigned long end) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long page_order = get_order(page_size); start = _ALIGN_DOWN(start, page_size); pr_debug("vmemmap_free %lx...%lx\n", start, end); for (; start < end; start += page_size) { + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + unsigned int nr_pages; unsigned long addr; /* @@ -251,29 +254,29 @@ void __ref vmemmap_free(unsigned long start, unsigned long end) continue; addr = vmemmap_list_free(start); - if (addr) { - struct page *page = pfn_to_page(addr >> PAGE_SHIFT); - - if (PageReserved(page)) { - /* allocated from bootmem */ - if (page_size < PAGE_SIZE) { - /* -* this shouldn't happen, but if it is -* the case, leave the memory there -*/ - WARN_ON_ONCE(1); - } else { - unsigned int nr_pages = - 1 << get_order(page_size); - while (nr_pages--) - free_reserved_page(page++); - } - } else - free_pages((unsigned long)(__va(addr)), - get_order(page_size)); - - vmemmap_remove_mapping(start, page_size); + if (!addr) + continue; + + page = pfn_to_page(addr >> PAGE_SHIFT); + nr_pages = 1 << page_order; + + if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* +* this shouldn't happen, but if it is +* the case, leave the memory there +*/ + WARN_ON_ONCE(1); + } else { + while (nr_pages--) + free_reserved_page(page++); + } + } else { + free_pages((unsigned long)(__va(addr)), page_order); } + + 
vmemmap_remove_mapping(start, page_size); } } #endif -- 2.9.3
[PATCH 3/9] powerpc/mm: Add _PAGE_DEVMAP for ppc64.
From: "Aneesh Kumar K.V" Add a _PAGE_DEVMAP bit for PTE and DAX PMD entires. PowerPC doesn't currently support PUD faults so we haven't extended it to the PUD level. Cc: Aneesh Kumar K.V Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/book3s/64/pgtable.h | 37 +++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index fb72ff6b98e6..b5fc6337649e 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -78,6 +78,9 @@ #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ +#define _PAGE_DEVMAP _RPAGE_SW1 +#define __HAVE_ARCH_PTE_DEVMAP + /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE * Instead of fixing all of them, add an alternate define which @@ -602,6 +605,16 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); +} + +static inline int pte_devmap(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ @@ -966,6 +979,9 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd))) #define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd))) +#define pud_pfn(...) (0) +#define pgd_pfn(...) 
(0) + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #define pmd_soft_dirty(pmd)pte_soft_dirty(pmd_pte(pmd)) #define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))) @@ -1140,7 +1156,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } - #define arch_needs_pgtable_deposit arch_needs_pgtable_deposit static inline bool arch_needs_pgtable_deposit(void) { @@ -1149,6 +1164,26 @@ static inline bool arch_needs_pgtable_deposit(void) return true; } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); +} + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ -- 2.9.3
[PATCH 2/9] mm/huge_memory: Deposit a pgtable for DAX PMD faults when required
Although all architectures use a deposited page table for THP on anonymous VMAs some architectures (s390 and powerpc) require the deposited storage even for file backed VMAs due to quirks of their MMUs. This patch adds support for depositing a table in DAX PMD fault handling path for archs that require it. Other architectures should see no functional changes. Cc: "Aneesh Kumar K.V" Cc: linux...@kvack.org Signed-off-by: Oliver O'Halloran --- mm/huge_memory.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aa01dd47cc65..a84909cf20d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, + pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; @@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); } + + if (pgtable) { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + atomic_long_inc(&mm->nr_ptes); + } + set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); @@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; + pgtable_t pgtable = NULL; /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm, addr); + if (!pgtable) + return VM_FAULT_OOM; + } + track_pfn_insert(vma, 
&pgprot, pfn); - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1611,6 +1625,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_dax(vma)) { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); -- 2.9.3
[PATCH 1/9] mm/huge_memory: Use zap_deposited_table() more
Depending flags of the PMD being zapped there may or may not be a deposited pgtable to be freed. In two of the three cases this is open coded while the third uses the zap_deposited_table() helper. This patch converts the others to use the helper to clean things up a bit. Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: linux...@kvack.org Signed-off-by: Oliver O'Halloran --- For reference: void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); atomic_long_dec(&mm->nr_ptes); } --- mm/huge_memory.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b787c4cfda0e..aa01dd47cc65 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1615,8 +1615,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { - pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { @@ -1625,10 +1624,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) { - pgtable_t pgtable; - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - pte_free(tlb->mm, pgtable); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) -- 2.9.3
ZONE_DEVICE and pmem API support for powerpc
Hi all, This series adds support for ZONE_DEVICE and the pmem api on powerpc. Namely, support for altmaps and the various bits and pieces required for DAX PMD faults. The first two patches touch generic mm/ code, but otherwise this is fairly well contained in arch/powerpc. If the nvdimm folks could sanity check this series I'd appreciate it. Series is based on next-20170411, but it should apply elsewhere with minor fixups to arch_{add|remove}_memory due to conflicts with HMM. For those interested in testing this, there is a driver and matching firmware that carves out some system memory for use as an emulated Con Tutto memory card. Driver: https://github.com/oohal/linux/tree/contutto-next Firmware: https://github.com/oohal/skiboot/tree/fake-contutto Edit core/init.c:686 to control the amount of memory borrowed for the emulated device. I'm keeping the driver out of tree until 4.13 since I plan on reworking the firmware interface anyway and there's at least one showstopper bug. Thanks, Oliver
Re: [v8] powerpc/powernv: add 'firmware/exports' attributes to sysfs
On Thu, Mar 30, 2017 at 10:28 AM, Matt Brown wrote: > The HDAT data area is consumed by skiboot and turned into a device-tree. In > some cases we would like to look directly at the HDAT. This is not possible > through /dev/mem as it is reserved memory which is stopped by the /dev/mem > filter. There are also other memory areas which are reserved but could be > useful to view for debugging purposes. > > This patch adds sysfs nodes to allow specified memory areas to be viewed. > sysfs nodes are created for each property in the device-tree under > /ibm,opal/firmware/exports/, and adds them to /sys/firmware/opal/exports/ > with root read-only permissions. > > Signed-off-by: Matt Brown > --- > Changelog > v8 > - fixed error handling > - added dynamic allocation of attributes > - using of_property_read_u64_array for reading attr vals > - reordered vars > - renaming vars > --- > arch/powerpc/platforms/powernv/opal.c | 81 > +++ > 1 file changed, 81 insertions(+) > > diff --git a/arch/powerpc/platforms/powernv/opal.c > b/arch/powerpc/platforms/powernv/opal.c > index 2822935..232f94e 100644 > --- a/arch/powerpc/platforms/powernv/opal.c > +++ b/arch/powerpc/platforms/powernv/opal.c > @@ -604,6 +604,84 @@ static void opal_export_symmap(void) > pr_warn("Error %d creating OPAL symbols file\n", rc); > } > > +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, > +struct bin_attribute *bin_attr, char *buf, > +loff_t off, size_t count) > +{ > + return memory_read_from_buffer(buf, count, &off, bin_attr->private, > + bin_attr->size); > +} > + > +/* > + * opal_export_attrs: creates a sysfs node for each property listed in > + * the device-tree under /ibm,opal/firmware/exports/ > + * All new sysfs nodes are created under /opal/exports/. > + * This allows for reserved memory regions (e.g. HDAT) to be read. > + * The new sysfs nodes are only readable by root. 
> + */ > +static void opal_export_attrs(void) > +{ > + struct bin_attribute *attr_tmp; > + struct device_node *np; > + struct property *prop; > + struct kobject *kobj; > + u64 vals[2]; > + int rc, n; > + > + /* Create new 'exports' directory - /sys/firmware/opal/exports */ > + kobj = kobject_create_and_add("exports", opal_kobj); > + if (!kobj) { > + pr_warn("kobject_create_and_add exports failed\n"); > + return; > + } > + > + np = of_find_node_by_path("/ibm,opal/firmware/exports"); > + if (!np) > + return; > + > + n = 0; > + for (prop = np->properties; prop != NULL; prop = prop->next) > + n++; > + > + if (n < 2) > + goto cleanup; > + > + for_each_property_of_node(np, prop) { > + if (!strcmp(prop->name, "name") || > + !strcmp(prop->name, "phandle")) > + continue; > + > + if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) > + continue; > + > + attr_tmp = kmalloc(sizeof(*attr_tmp), GFP_KERNEL); > + > + if (attr_tmp == NULL) { > + pr_warn("Failed kmalloc for bin_attribute attr_tmp"); > + continue; > + } > + > + attr_tmp->attr.name = kstrdup(prop->name, GFP_KERNEL); > + attr_tmp->attr.mode = 0400; > + attr_tmp->read = export_attr_read; > + attr_tmp->private = __va(vals[0]); > + attr_tmp->size = vals[1]; > + > + if (attr_tmp->attr.name == NULL) { > + pr_warn("Failed kstrdup for bin_attribute attr.name"); > + kfree(attr_tmp); > + continue; > + } > + rc = sysfs_create_bin_file(kobj, attr_tmp); > + if (rc) > + pr_warn("Error %d creating OPAL sysfs exports/%s > file\n", > + rc, prop->name); > + } > + > +cleanup: > + of_node_put(np); > +} > + > static void __init opal_dump_region_init(void) > { > void *addr; > @@ -742,6 +820,9 @@ static int __init opal_init(void) > opal_msglog_sysfs_init(); > } > > + /* Export all properties */ > + opal_export_attrs(); > + > /* Initialize platform devices: IPMI backend, PRD & flash interface */ > opal_pdev_init("ibm,opal-ipmi"); > opal_pdev_init("ibm,opal-flash"); > -- > 2.9.3 > Reviewed-by: Oliver O'Halloran
[PATCH] powerpc/mm: remove stale comment
The code to fix the problem it describes was removed in c40785a and it uses the stupid comment style. Away it goes! Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/hash_utils_64.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8848fec..69a05b3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -927,11 +927,6 @@ static void __init htab_initialize(void) } #endif /* CONFIG_DEBUG_PAGEALLOC */ - /* On U3 based machines, we need to reserve the DART area and -* _NOT_ map it to avoid cache paradoxes as it's remapped non -* cacheable later on -*/ - /* create bolted the linear mapping in the hash table */ for_each_memblock(memory, reg) { base = (unsigned long)__va(reg->base); -- 2.9.3
[PATCH] powerpc/misc: fix exported functions that reference the TOC
When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does not include a global entry point. A function's global entry point is used when the function is called from a different TOC context and in the kernel this typically means a call from a module into the vmlinux (or vis-a-vis). There are a few exported ASM functions declared with _GLOBAL() and calling them from a module will module will likely crash the kernel since any TOC relative load will yield garbage. To fix this use _GLOBAL_TOC() for exported asm functions rather than _GLOBAL() and some documentation about when to use each. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/ppc_asm.h | 12 arch/powerpc/kernel/misc_64.S | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 359c443..3abf8c3 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -198,6 +198,18 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) #ifdef PPC64_ELF_ABI_v2 +/* + * When to use _GLOBAL_TOC() instead of _GLOBAL(): + * + * a) The function is exported using EXPORT_SYMBOL_*() + * *and* + * b) The function, or any function that it calls, references the TOC. + * + * In this situation _GLOBAL_TOC() is required because exported functions are + * callable from modules which may a different TOC to the kernel proper and the + * _GLOBAL() macro skips the TOC setup which is required on ELF ABIv2. 
+ */ + #define _GLOBAL(name) \ .align 2 ; \ .type name,@function; \ diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index ec94aef..d18da8c 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -67,7 +67,7 @@ PPC64_CACHES: * flush all bytes from start through stop-1 inclusive */ -_GLOBAL(flush_icache_range) +_GLOBAL_TOC(flush_icache_range) BEGIN_FTR_SECTION PURGE_PREFETCHED_INS blr @@ -120,7 +120,7 @@ EXPORT_SYMBOL(flush_icache_range) * *flush all bytes from start to stop-1 inclusive */ -_GLOBAL(flush_dcache_range) +_GLOBAL_TOC(flush_dcache_range) /* * Flush the data cache to memory -- 2.9.3
[PATCH 2/2] powerpc/mm: add phys addr to linux page table dump
The current page table dumper scans the linux page tables and coalesces mappings with adjacent virtual addresses and similar PTE flags. This behaviour is somewhat broken when you consider the IOREMAP space where entirely unrelated mappings will appear to be contiguous. This patch modifies the range coalescing so that only ranges that are both physically and virtually contiguous are combined. This patch also adds to the dump output the physical address at the start of each range. Cc: Rashmica Gupta Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/dump_linuxpagetables.c | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index e7cbfd5a0940..85e6a45bd7ee 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -56,6 +56,8 @@ struct pg_state { struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; unsigned int level; u64 current_flags; }; @@ -265,7 +267,9 @@ static void dump_addr(struct pg_state *st, unsigned long addr) const char *unit = units; unsigned long delta; - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); + seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); + seq_printf(st->seq, "%016lx ", st->start_pa); + delta = (addr - st->start_address) >> 10; /* Work out what appropriate unit to use */ while (!(delta & 1023) && unit[1]) { @@ -280,11 +284,15 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; + u64 pa = val & PTE_RPN_MASK; + /* At first no level is set */ if (!st->level) { st->level = level; st->current_flags = flag; st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: @@ -292,9 +300,11 @@ static 
void note_page(struct pg_state *st, unsigned long addr, * - we change levels in the tree. * - the address is in a different section of memory and is thus * used for a different purpose, regardless of the flags. +* - the pa of this page is not adjacent to the last inspected page */ } else if (flag != st->current_flags || level != st->level || - addr >= st->marker[1].start_address) { + addr >= st->marker[1].start_address || + pa != st->last_pa + PAGE_SIZE) { /* Check the PTE flags */ if (st->current_flags) { @@ -318,8 +328,12 @@ static void note_page(struct pg_state *st, unsigned long addr, seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; st->current_flags = flag; st->level = level; + } else { + st->last_pa = pa; } } -- 2.9.3
[PATCH 1/2] powerpc/mm: fix up pgtable dump flags
On Book3s we have two PTE flags used to mark cache-inhibited mappings: _PAGE_TOLERANT and _PAGE_NON_IDEMPOTENT. Currently the kernel page table dumper only looks at the generic _PAGE_NO_CACHE which is defined to be _PAGE_TOLERANT. This patch modifies the dumper so both flags are shown in the dump. Cc: Rashmica Gupta Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/dump_linuxpagetables.c | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 49abaf4dc8e3..e7cbfd5a0940 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -154,11 +154,24 @@ static const struct flag_info flag_array[] = { .clear = " ", }, { #endif +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_NO_CACHE, .val= _PAGE_NO_CACHE, .set= "no cache", .clear = "", }, { +#else + .mask = _PAGE_NON_IDEMPOTENT, + .val= _PAGE_NON_IDEMPOTENT, + .set= "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val= _PAGE_TOLERANT, + .set= "tolerant", + .clear = "", + }, { +#endif #ifdef CONFIG_PPC_BOOK3S_64 .mask = H_PAGE_BUSY, .val= H_PAGE_BUSY, -- 2.9.3
Re: [PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings
On Tue, Mar 28, 2017 at 2:03 PM, Michael Ellerman wrote: > Oliver O'Halloran writes: >> On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman >> wrote: >>> Oliver O'Halloran writes: >>>> + /* threads that share a chip-id are considered siblings (same die) */ >>> >>> Also "Threads" :) >> >> The cpus masks are all built in terms of threads, so this is >> technically correct even if it sounds stupid. Maybe "logical cpus" >> would be better? > > No I meant you need a capital "T" ! capital letters are against my religion. > > cheers
Re: Build failure -- powerpc/boot: Add OPAL console to epapr wrappers
On Sat, Mar 25, 2017 at 4:00 AM, Daniel Walker wrote: > I get this build failure, > > > In file included from arch/powerpc/boot/fdt.c:51: > ../arch/powerpc/boot/libfdt_env.h:9: error: redefinition of typedef > 'uint32_t' > ../arch/powerpc/boot/types.h:20: note: previous declaration of 'uint32_t' > was here > ../arch/powerpc/boot/libfdt_env.h:10: error: redefinition of typedef > 'uint64_t' > ../arch/powerpc/boot/types.h:21: note: previous declaration of 'uint64_t' > was here > make[2]: *** [arch/powerpc/boot/fdt.o] Error 1 > make[1]: *** [uImage] Error 2 > make[1]: Leaving directory `/nobackup/danielwa/linux/t1040' > make: *** [sub-make] Error 2 > > > and it bisects to , > > > commit 656ad58ef19e2a763fa5c938b20ae0f6b8d67242 > Author: Oliver O'Halloran > Date: Fri Jul 1 00:34:37 2016 +1000 > > powerpc/boot: Add OPAL console to epapr wrappers > > This patch adds an OPAL console backend to the powerpc boot wrapper so > that decompression failures inside the wrapper can be reported to the > user. This is important since it typically indicates data corruption in > the firmware and other nasty things. > > Currently this only works when building a little endian kernel. When > compiling a 64 bit BE kernel the wrapper is always build 32 bit to be > compatible with some 32 bit firmwares. BE support will be added at a > later date. Another limitation of this is that only the "raw" type of > OPAL console is supported, however machines that provide a hvsi console > also provide a raw console so this is not an issue in practice. > > Actually-written-by: Benjamin Herrenschmidt > Signed-off-by: Oliver O'Halloran > [mpe: Move #ifdef __powerpc64__ to avoid warnings on 32-bit] > Signed-off-by: Michael Ellerman > > > I can provide a config file if needed. My apologies if this was already > reported. Thanks for the report, I don't think this is a known bug. mpe's build testing is pretty thorough so I'm surprised this wasn't caught sooner. 
A config file and the version of gcc that you're using would be useful. Oliver
[PATCH v2] powerpc/powernv: de-deuplicate OPAL call wrappers
Currently the code to perform an OPAL call is duplicated between the normal path and path taken when tracepoints are enabled. There's no real need for this and combining them makes opal_tracepoint_entry considerably easier to understand. Signed-off-by: Oliver O'Halloran --- v1 -> v2: slight rework due to the real mode opal call changes --- arch/powerpc/platforms/powernv/opal-wrappers.S | 53 +++--- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index da8a0f7a035c..ebf6719d241a 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \ #define OPAL_BRANCH(LABEL) #endif -/* TODO: - * - * - Trace irqs in/off (needs saving/restoring all args, argh...) - * - Get r11 feed up by Dave so I can have better register usage +/* + * DO_OPAL_CALL assumes: + * r0 = opal call token + * r12 = msr + * LR has been saved */ - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name);\ - mfmsr r12;\ - mflrr0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ +#define DO_OPAL_CALL() \ mfcrr11;\ stw r11,8(r1); \ li r11,0; \ @@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \ mtspr SPRN_HSRR0,r12; \ hrfid +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name);\ + mfmsr r12;\ + mflrr0; \ + andi. r11,r12,MSR_IR|MSR_DR; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + beq opal_real_call; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + DO_OPAL_CALL() + + opal_return: /* * Fixup endian on OPAL return... 
we should be able to simplify @@ -148,26 +152,13 @@ opal_tracepoint_entry: ld r8,STK_REG(R29)(r1) ld r9,STK_REG(R30)(r1) ld r10,STK_REG(R31)(r1) + + /* setup LR so we return via tracepoint_return */ LOAD_REG_ADDR(r11,opal_tracepoint_return) - mfcrr12 std r11,16(r1) - stw r12,8(r1) - li r11,0 + mfmsr r12 - ori r11,r11,MSR_EE - std r12,PACASAVEDMSR(r13) - andcr12,r12,r11 - mtmsrd r12,1 - LOAD_REG_ADDR(r11,opal_return) - mtlrr11 - li r11,MSR_DR|MSR_IR|MSR_LE - andcr12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid + DO_OPAL_CALL() opal_tracepoint_return: std r3,STK_REG(R31)(r1) -- 2.9.3
Re: [PATCH 4/5] powerpc/smp: add cpu_cache_mask
On Wed, Mar 15, 2017 at 10:26 PM, Michael Ellerman wrote: > Oliver O'Halloran writes: > >> Traditionally we have only ever tracked which CPUs are in the same core >> (cpu_sibling_mask) and on the same die (cpu_core_mask). For Power9 we >> need to be aware of which CPUs share cache with each other so this patch >> adds cpu_cache_mask and the underlying cpu_cache_map variable to track >> this. > > But which cache? I'm not sure it matters. All the scheduler really wants to know is that migrating between cpus with a shared cache is cheaper than migrating elsewhere. > Some CPUs on Power8 share L3, or L4. Eh... it's not really the same. The "L4" is part of the memory buffers and its function is conceptually different to the processor caches. The L3 on P8 is only shared when the core that owns it is offline (or sleeping) so the scheduler doesn't really need to be aware of it. Even if the scheduler was aware I don't think it can take advantage of it without some terrible hacks. > > I think just call it cpu_l2cache_map to make it explicit. I was being deliberately vague. I know it's only a shared L2 currently, but it's possible we might have a (real) shared L3 in the future. The latest high-end x86 chips have some L3 sharing across the entire chip so you never know. I'm not particularly attached to the name though, so i'll rename it if you really want. Oliver
Re: [PATCH 2/5] powerpc/smp: add set_cpus_related()
On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman wrote: > Oliver O'Halloran writes: >> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c >> index dfe0e1d9cd06..1c531887ca51 100644 >> --- a/arch/powerpc/kernel/smp.c >> +++ b/arch/powerpc/kernel/smp.c >> @@ -377,6 +377,25 @@ static void smp_store_cpu_info(int id) >> #endif >> } >> >> +/* >> + * Relationships between CPUs are maintained in a set of per-cpu cpumasks. >> We >> + * need to ensure that they are kept consistant between CPUs when they are >> + * changed. >> + * >> + * This is slightly tricky since the core mask must be a strict superset of >> + * the sibling mask. >> + */ >> +static void set_cpus_related(int i, int j, bool related, struct cpumask >> *(*relation_fn)(int)) >> +{ >> + if (related) { >> + cpumask_set_cpu(i, relation_fn(j)); >> + cpumask_set_cpu(j, relation_fn(i)); >> + } else { >> + cpumask_clear_cpu(i, relation_fn(j)); >> + cpumask_clear_cpu(j, relation_fn(i)); >> + } >> +} > > I think you pushed the abstraction one notch too far on this one, or > perhaps not far enough. > > We end up with a function called "set" that might clear, depending on a > bool you pass. Which is hard to parse, eg: > > set_cpus_related(cpu, base + i, false, cpu_sibling_mask); > > And I know there's two places where we pass an existing bool "add", but > there's four where we pass true or false. I think you're looking at this patch. 
With the full series applied we never pass a literal to set_cpus_related() directly: [12:14 oliver ~/.../powerpc/kernel (p9-sched $%)]$ gg set_cpus_related smp.c:391:static void set_cpus_related(int i, int j, bool related, struct cpumask *(*relation_fn)(int)) smp.c:647: set_cpus_related(cpu, cpu, add, cpu_core_mask); smp.c:651: set_cpus_related(cpu, i, add, cpu_core_mask); smp.c:685: set_cpus_related(cpu, cpu, onlining, mask_fn); smp.c:697: set_cpus_related(cpu, i, onlining, mask_fn); smp.c:721: set_cpus_related(cpu, base + i, onlining, cpu_sibling_mask); smp.c:736: set_cpus_related(cpu, cpu, onlining, cpu_core_mask); smp.c:746: set_cpus_related(cpu, i, onlining, cpu_core_mask); I agree that set_cpus_related() is probably a bad name, make_cpus_related() maybe? > > If we want to push it in that direction I think we should just pass the > set/clear routine instead of the flag, so: > > do_cpus_related(cpu, base + i, cpumask_clear_cpu, cpu_sibling_mask); > > But that might be overdoing it. I think this would be ok. > > So I think we should just do: > > static void set_cpus_related(int i, int j, struct cpumask *(*mask_func)(int)) > { > cpumask_set_cpu(i, mask_func(j)); > cpumask_set_cpu(j, mask_func(i)); > } > > static void clear_cpus_related(int i, int j, struct cpumask > *(*mask_func)(int)) > { > cpumask_clear_cpu(i, mask_func(j)); > cpumask_clear_cpu(j, mask_func(i)); > } > > > So the cases with add become: > > if (add) > set_cpus_related(cpu, i, cpu_core_mask(i)); > else > clear_cpus_related(cpu, i, cpu_core_mask(i)); Dunno, I was trying to get rid of this sort of thing since the logic is duplicated in a lot of places. Seemed to me that it was just pointlessly verbose rather than being helpfully explicit. > > Which is not as pretty but more explicit. > > And the other cases look much better, eg: > > clear_cpus_related(cpu, base + i, cpu_sibling_mask); > > ?? > > cheers
Re: [PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings
On Wed, Mar 15, 2017 at 10:18 PM, Michael Ellerman wrote: > Oliver O'Halloran writes: > >> To determine which logical CPUs are on the same core the kernel uses the >> ibm,chipid property from the device tree node associated with that cpu. >> The lookup for this this information is currently open coded in both >> traverse_siblings() and traverse_siblings_chip_id(). This patch replaces >> these manual lookups with the existing cpu_to_chip_id() function. > > Some minor nits. > > cpu_to_chip_id() actually searches recursively up the parents until it > finds a ibm,chip-id, so it's not a 1:1 replacement for the existing > logic, but it's probably still an OK conversion. It's still worth > mentioning in the change log thought. fair enough >> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c >> index 893bd7f79be6..dfe0e1d9cd06 100644 >> --- a/arch/powerpc/kernel/smp.c >> +++ b/arch/powerpc/kernel/smp.c >> @@ -664,23 +655,19 @@ static void traverse_core_siblings(int cpu, bool add) >> { >> struct device_node *l2_cache, *np; >> const struct cpumask *mask; >> - int i, chip, plen; >> - const __be32 *prop; >> + int chip_id; >> + int i; >> >> - /* First see if we have ibm,chip-id properties in cpu nodes */ >> - np = of_get_cpu_node(cpu, NULL); >> - if (np) { >> - chip = -1; >> - prop = of_get_property(np, "ibm,chip-id", &plen); >> - if (prop && plen == sizeof(int)) >> - chip = of_read_number(prop, 1); >> - of_node_put(np); >> - if (chip >= 0) { >> - traverse_siblings_chip_id(cpu, add, chip); >> - return; >> - } >> + /* threads that share a chip-id are considered siblings (same die) */ > > You might know it means the "same die", but AFAIK there's no actual > definition for what the chip-id means, so let's not write comments that > might be wrong in future. Just saying they're considered siblings is > sufficient. > > Also "Threads" :) The cpus masks are all built in terms of threads, so this is technically correct even if it sounds stupid. 
Maybe "logical cpus" would be better? > > cheers
Re: [Patch v5] powerpc/powernv: add hdat attribute to sysfs
On Thu, Mar 2, 2017 at 4:44 PM, Matt Brown wrote: > The HDAT data area is consumed by skiboot and turned into a device-tree. > In some cases we would like to look directly at the HDAT, so this patch > adds a sysfs node to allow it to be viewed. This is not possible through > /dev/mem as it is reserved memory which is stopped by the /dev/mem filter. > This patch also adds sysfs nodes for all properties in the device-tree > under /ibm,opal/firmware/exports. > > Signed-off-by: Matt Brown > --- > Changes between v4 and v5: > - all properties under /ibm,opal/firmware/exports in the device-tree > are now added as new sysfs nodes > - the new sysfs nodes are now placed under /opal/exports > - added a generic read function for all exported attributes > --- > arch/powerpc/platforms/powernv/opal.c | 84 > +++ > 1 file changed, 84 insertions(+) > > diff --git a/arch/powerpc/platforms/powernv/opal.c > b/arch/powerpc/platforms/powernv/opal.c > index 2822935..fbb8264 100644 > --- a/arch/powerpc/platforms/powernv/opal.c > +++ b/arch/powerpc/platforms/powernv/opal.c > @@ -36,6 +36,9 @@ > /* /sys/firmware/opal */ > struct kobject *opal_kobj; > > +/* /sys/firmware/opal/exports */ > +struct kobject *opal_export_kobj; > + > struct opal { > u64 base; > u64 entry; > @@ -604,6 +607,82 @@ static void opal_export_symmap(void) > pr_warn("Error %d creating OPAL symbols file\n", rc); > } > > + > +static int opal_exports_sysfs_init(void) > +{ > + opal_export_kobj = kobject_create_and_add("exports", opal_kobj); > + if (!opal_export_kobj) { > + pr_warn("kobject_create_and_add opal_exports failed\n"); > + return -ENOMEM; > + } > + > + return 0; > +} This can be folded into opal_export_attrs(). 
> + > +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, > +struct bin_attribute *bin_attr, char *buf, > +loff_t off, size_t count) > +{ > + return memory_read_from_buffer(buf, count, &off, bin_attr->private, > + bin_attr->size); > +} > + > +static struct bin_attribute *exported_attrs; > +/* > + * opal_export_attrs: creates a sysfs node for each property listed in > + * the device-tree under /ibm,opal/firmware/exports/ > + * All new sysfs nodes are created under /opal/exports/. > + * This allows for reserved memory regions (e.g. HDAT) to be read. > + * The new sysfs nodes are only readable by root. > + */ > +static void opal_export_attrs(void) > +{ > + const __be64 *syms; > + unsigned int size; > + struct device_node *fw; > + struct property *prop; > + int rc; > + int attr_count = 0; > + int n = 0; > + > + fw = of_find_node_by_path("/ibm,opal/firmware/exports"); > + if (!fw) > + return; devicetree nodes are reference counted so when you take a reference to one using of_find_node_* you should use of_node_put() to drop the reference when you're finished with it. Of course, there's plenty of existing code that doesn't do this, but that's no reason to make a bad problem worse ;) > + > + for (prop = fw->properties; prop != NULL; prop = prop->next) > + attr_count++; > + > + if (attr_count > 2) > + exported_attrs = > kmalloc(sizeof(exported_attrs)*(attr_count-2), > + __GFP_IO | __GFP_FS); Why are you using __GFP_IO | __GFP_FS instead of GFP_KERNEL? Also, using kzalloc(), which zeros memory, over kmalloc() is a good idea in general since structures can contain fields that change the behaviour of the function that you pass them to. 
> + > + > + for_each_property_of_node(fw, prop) { > + > + syms = of_get_property(fw, prop->name, &size); > + > + if (!strcmp(prop->name, "name") || > + !strcmp(prop->name, "phandle")) > + continue; > + > + if (!syms || size != 2 * sizeof(__be64)) > + continue; > + > + (exported_attrs+n)->attr.name = prop->name; References to DT properties are only valid if you have a reference to the DT node that contains them. DT nodes and properties can (in theory) be changed at runtime, but in practice this only really happens for nodes that refer to hotpluggable devices (memory, PCI, etc), but its still poor form to rely on things not happening. You can make a copy of the name with kstrdup() and store that pointer for as long as you like, since you can guarantee the copy will exist until you explicitly free() it. > + (exported_attrs+n)->attr.mode = 0400; > + (exported_attrs+n)->read = export_attr_read; > + (exported_attrs+n)->private = __va(be64_to_cpu(syms[0])); > + (exported_attrs+n)->size = be64_to_cpu(syms[1]); (exported_attrs+n
[PATCH 5/5] powerpc/smp: Add Power9 scheduler topology
In previous generations of Power processors each core had a private L2 cache. The Power9 processor has a slightly different architecture where the L2 cache is shared among pairs of cores rather than being completely private. Making the scheduler aware of this cache sharing allows the scheduler to make more intelligent migration decisions. When one core in the pair is overloaded tasks can be migrated to its paired core to improve throughput without the cache-refilling penalty typically associated with task migration. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/smp.c | 44 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5571f30ff72d..5e1811b24415 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -673,7 +673,7 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static bool update_core_mask_by_l2(int cpu, bool onlining) +static bool update_mask_by_l2(int cpu, bool onlining, struct cpumask *(*mask_fn)(int)) { const struct cpumask *mask = onlining ? 
cpu_online_mask : cpu_present_mask; struct device_node *l2_cache, *np; @@ -689,7 +689,7 @@ static bool update_core_mask_by_l2(int cpu, bool onlining) continue; if (np == l2_cache) - set_cpus_related(cpu, i, onlining, cpu_core_mask); + set_cpus_related(cpu, i, onlining, mask_fn); of_node_put(np); } @@ -724,10 +724,17 @@ static void update_cpu_masks(int cpu, bool onlining) update_thread_mask(cpu, onlining); + /* we need the l2 cache mask for the power9 scheduler topology */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + update_mask_by_l2(cpu, onlining, cpu_cache_mask); + + /* now build the core mask */ + set_cpus_related(cpu, cpu, onlining, cpu_core_mask); + if (update_core_mask_by_chip_id(cpu, onlining)) return; - if (update_core_mask_by_l2(cpu, onlining)) + if (update_mask_by_l2(cpu, onlining, cpu_core_mask)) return; /* if all else fails duplicate the sibling mask */ @@ -805,6 +812,32 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; + +/* P9 has a slightly odd architecture where two, four thread cores share an L2 + * cache. For highly threaded workloads it makes sense to try and keep tasks + * inside the pair for better cache utilisation so the scheduler needs to be + * aware of this. 
*/ +static int powerpc_shared_cache_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING; +} + +/* this is kind of gross, but passing cpu_cache_mask directly + * causes the build to fail due to incompatible pointer types */ +static inline const struct cpumask *cpu_cache_mask_c(int cpu) +{ + return cpu_cache_mask(cpu); +} + +static struct sched_domain_topology_level power9_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cache_mask_c, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -829,7 +862,10 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); - set_sched_topology(powerpc_topology); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + set_sched_topology(power9_topology); + else + set_sched_topology(powerpc_topology); } -- 2.9.3
[PATCH 4/5] powerpc/smp: add cpu_cache_mask
Traditionally we have only ever tracked which CPUs are in the same core (cpu_sibling_mask) and on the same die (cpu_core_mask). For Power9 we need to be aware of which CPUs share cache with each other so this patch adds cpu_cache_mask and the underlying cpu_cache_map variable to track this. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/smp.h | 6 ++ arch/powerpc/kernel/smp.c | 5 + 2 files changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 32db16d2e7ad..a7fc3a105d61 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -94,6 +94,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys) #endif DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_cache_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); static inline struct cpumask *cpu_sibling_mask(int cpu) @@ -106,6 +107,11 @@ static inline struct cpumask *cpu_core_mask(int cpu) return per_cpu(cpu_core_map, cpu); } +static inline struct cpumask *cpu_cache_mask(int cpu) +{ + return per_cpu(cpu_cache_map, cpu); +} + extern int cpu_to_core_id(int cpu); /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 3922cace927e..5571f30ff72d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -72,9 +72,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct thread_info *secondary_ti; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* SMP operations for this machine */ @@ -415,6 +417,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + zalloc_cpumask_var_node(&per_cpu(cpu_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); /* @@ -428,6 +432,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); if (smp_ops && smp_ops->probe) -- 2.9.3
[PATCH 3/5] powerpc/smp: Add update_cpu_masks()
When adding and removing a CPU from the system the per-cpu masks that are used by the scheduler to construct scheduler domains need to be updated to account for the cpu entering or exiting the system. Currently this logic is open-coded for the thread sibling mask and shared for the core mask. This patch moves all the logic for rebuilding these masks into a single function and simplifies the logic which determines which CPUs are within a "core". Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/smp.c | 90 --- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 1c531887ca51..3922cace927e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -630,14 +630,20 @@ int cpu_first_thread_of_core(int core) } EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); -static void traverse_siblings_chip_id(int cpu, bool add, int chipid) +static bool update_core_mask_by_chip_id(int cpu, bool add) { const struct cpumask *mask = add ? cpu_online_mask : cpu_present_mask; + int chipid = cpu_to_chip_id(cpu); int i; + if (chipid == -1) + return false; + for_each_cpu(i, mask) if (cpu_to_chip_id(i) == chipid) set_cpus_related(cpu, i, add, cpu_core_mask); + + return true; } /* Must be called when no change can occur to cpu_present_mask, @@ -662,42 +668,72 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static void traverse_core_siblings(int cpu, bool add) +static bool update_core_mask_by_l2(int cpu, bool onlining) { + const struct cpumask *mask = onlining ? cpu_online_mask : cpu_present_mask; struct device_node *l2_cache, *np; - const struct cpumask *mask; - int chip_id; int i; - /* threads that share a chip-id are considered siblings (same die) */ - chip_id = cpu_to_chip_id(cpu); - - if (chip_id >= 0) { - traverse_siblings_chip_id(cpu, add, chip_id); - return; - } - - /* if the chip-id fails then group siblings by the L2 cache */ l2_cache = cpu_to_l2cache(cpu); - mask = add ? 
cpu_online_mask : cpu_present_mask; + if (l2_cache == NULL) + return false; + for_each_cpu(i, mask) { np = cpu_to_l2cache(i); if (!np) continue; if (np == l2_cache) - set_cpus_related(cpu, i, add, cpu_core_mask); + set_cpus_related(cpu, i, onlining, cpu_core_mask); of_node_put(np); } of_node_put(l2_cache); + + return true; +} + +static void update_thread_mask(int cpu, bool onlining) +{ + int base = cpu_first_thread_sibling(cpu); + int i; + + pr_info("CPUDEBUG: onlining cpu %d, base %d, thread_per_core %d", + cpu, base, threads_per_core); + + for (i = 0; i < threads_per_core; i++) { + /* Threads are onlined one by one. By the final time this +* function is called for the core the sibling mask for each +* thread will be complete, but we need to ensure that offline +* threads aren't touched before they run start_secondary() */ + if (onlining && cpu_is_offline(base + i) && (cpu != base + i)) + continue; + + set_cpus_related(cpu, base + i, onlining, cpu_sibling_mask); + } +} + +static void update_cpu_masks(int cpu, bool onlining) +{ + int i; + + update_thread_mask(cpu, onlining); + + if (update_core_mask_by_chip_id(cpu, onlining)) + return; + + if (update_core_mask_by_l2(cpu, onlining)) + return; + + /* if all else fails duplicate the sibling mask */ + for_each_cpu(i, cpu_sibling_mask(cpu)) + set_cpus_related(cpu, i, onlining, cpu_core_mask); } /* Activate a secondary processor. 
*/ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - int i, base; atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; @@ -721,19 +757,7 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i) && (cpu != base + i)) - continue; - set_cpus_related(cpu, base + i, true, cpu_sibling_mask); - - /* cpu_core_map should be a superset of -* cpu_sibling_map even if we don't have cache -* information, so update the former here, too. -*/ - set_cpus_related(cp
[PATCH 2/5] powerpc/smp: add set_cpus_related()
Add a helper function for updating the per-cpu core and sibling thread cpumasks. This helper just sets (or clears) the relevant bit in the cpumasks of each CPU. This is open-coded in several places inside the mask setup code so moving it into a separate function is a sensible cleanup. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/smp.c | 61 --- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index dfe0e1d9cd06..1c531887ca51 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -377,6 +377,25 @@ static void smp_store_cpu_info(int id) #endif } +/* + * Relationships between CPUs are maintained in a set of per-cpu cpumasks. We + * need to ensure that they are kept consistent between CPUs when they are + * changed. + * + * This is slightly tricky since the core mask must be a strict superset of + * the sibling mask. + */ +static void set_cpus_related(int i, int j, bool related, struct cpumask *(*relation_fn)(int)) +{ + if (related) { + cpumask_set_cpu(i, relation_fn(j)); + cpumask_set_cpu(j, relation_fn(i)); + } else { + cpumask_clear_cpu(i, relation_fn(j)); + cpumask_clear_cpu(j, relation_fn(i)); + } +} + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -616,17 +635,9 @@ static void traverse_siblings_chip_id(int cpu, bool add, int chipid) const struct cpumask *mask = add ? 
cpu_online_mask : cpu_present_mask; int i; - for_each_cpu(i, mask) { - if (cpu_to_chip_id(i) == chipid) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } - } + for_each_cpu(i, mask) + if (cpu_to_chip_id(i) == chipid) + set_cpus_related(cpu, i, add, cpu_core_mask); } /* Must be called when no change can occur to cpu_present_mask, @@ -666,23 +677,17 @@ static void traverse_core_siblings(int cpu, bool add) return; } - /* if the chip-id fails then threads which share L2 cache are */ - + /* if the chip-id fails then group siblings by the L2 cache */ l2_cache = cpu_to_l2cache(cpu); mask = add ? cpu_online_mask : cpu_present_mask; for_each_cpu(i, mask) { np = cpu_to_l2cache(i); if (!np) continue; - if (np == l2_cache) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } + + if (np == l2_cache) + set_cpus_related(cpu, i, add, cpu_core_mask); + of_node_put(np); } of_node_put(l2_cache); @@ -720,15 +725,13 @@ void start_secondary(void *unused) for (i = 0; i < threads_per_core; i++) { if (cpu_is_offline(base + i) && (cpu != base + i)) continue; - cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); + set_cpus_related(cpu, base + i, true, cpu_sibling_mask); /* cpu_core_map should be a superset of * cpu_sibling_map even if we don't have cache * information, so update the former here, too. 
*/ - cpumask_set_cpu(cpu, cpu_core_mask(base + i)); - cpumask_set_cpu(base + i, cpu_core_mask(cpu)); + set_cpus_related(cpu, base + i, true, cpu_core_mask); } traverse_core_siblings(cpu, true); @@ -818,10 +821,8 @@ int __cpu_disable(void) /* Update sibling maps */ base = cpu_first_thread_sibling(cpu); for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) { - cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu)); - cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); + set_cpus_related(cpu, base + i, false, cpu_sibling_mask); + set_cpus_related(cpu, base + i, false, cpu_core_mask); } traverse_core_siblings(cpu, false); -- 2.9.3
[PATCH 1/5] powerpc/smp: use cpu_to_chip_id() to find siblings
To determine which logical CPUs are on the same core the kernel uses the ibm,chip-id property from the device tree node associated with that cpu. The lookup for this information is currently open-coded in both traverse_siblings() and traverse_siblings_chip_id(). This patch replaces these manual lookups with the existing cpu_to_chip_id() function. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/smp.c | 39 +-- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 893bd7f79be6..dfe0e1d9cd06 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -613,19 +613,11 @@ EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); static void traverse_siblings_chip_id(int cpu, bool add, int chipid) { - const struct cpumask *mask; - struct device_node *np; - int i, plen; - const __be32 *prop; + const struct cpumask *mask = add ? cpu_online_mask : cpu_present_mask; + int i; - mask = add ? cpu_online_mask : cpu_present_mask; for_each_cpu(i, mask) { - np = of_get_cpu_node(i, NULL); - if (!np) - continue; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int) && - of_read_number(prop, 1) == chipid) { + if (cpu_to_chip_id(i) == chipid) { if (add) { cpumask_set_cpu(cpu, cpu_core_mask(i)); cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -634,7 +626,6 @@ static void traverse_siblings_chip_id(int cpu, bool add, int chipid) cpumask_clear_cpu(i, cpu_core_mask(cpu)); } } - of_node_put(np); } } @@ -664,23 +655,19 @@ static void traverse_core_siblings(int cpu, bool add) { struct device_node *l2_cache, *np; const struct cpumask *mask; - int i, chip, plen; - const __be32 *prop; + int chip_id; + int i; - /* First see if we have ibm,chip-id properties in cpu nodes */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - chip = -1; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int)) - chip = of_read_number(prop, 1); - of_node_put(np); - if (chip >= 0) { - 
traverse_siblings_chip_id(cpu, add, chip); - return; - } + /* threads that share a chip-id are considered siblings (same die) */ + chip_id = cpu_to_chip_id(cpu); + + if (chip_id >= 0) { + traverse_siblings_chip_id(cpu, add, chip_id); + return; } + /* if the chip-id fails then threads which share L2 cache are */ + l2_cache = cpu_to_l2cache(cpu); mask = add ? cpu_online_mask : cpu_present_mask; for_each_cpu(i, mask) { -- 2.9.3
Re: [PATCH v3] powerpc/powernv: add hdat attribute to sysfs
On Mon, Feb 27, 2017 at 9:56 PM, Michael Ellerman wrote: > Matt Brown writes: >> diff --git a/arch/powerpc/platforms/powernv/opal-hdat.c >> b/arch/powerpc/platforms/powernv/opal-hdat.c >> new file mode 100644 >> index 000..3315dd3 >> --- /dev/null >> +++ b/arch/powerpc/platforms/powernv/opal-hdat.c >> @@ -0,0 +1,65 @@ > ... >> + >> + >> +/* HDAT attribute for sysfs */ >> +static struct bin_attribute hdat_attr = { >> + .attr = {.name = "hdat", .mode = 0444}, > > ajd and oohal report to my office. I don't think there's anything in the HDAT that's sensitive. That said, this might not be true in the future so making it only readable by root might be a good idea. Oliver
Re: [PATCH v3] powerpc/powernv: add hdat attribute to sysfs
On Mon, Feb 27, 2017 at 12:59 PM, Andrew Donnellan wrote: > On 24/02/17 17:20, Matt Brown wrote: >> >> The HDAT data area is consumed by skiboot and turned into a device-tree. >> In some cases we would like to look directly at the HDAT, so this patch >> adds a sysfs node to allow it to be viewed. This is not possible through >> /dev/mem as it is reserved memory which is stopped by the /dev/mem filter. >> >> Signed-off-by: Matt Brown > > > Changes look good, thanks for addressing the comments! Still a couple of > minor points below, otherwise: > > Reviewed-by: Andrew Donnellan > > Stewart: this might need your ACK? > > >> --- >> >> Changes between v2 to v3: >> - fixed header comments >> - simplified if statement >> >> --- >> arch/powerpc/include/asm/opal.h| 1 + >> arch/powerpc/platforms/powernv/Makefile| 1 + >> arch/powerpc/platforms/powernv/opal-hdat.c | 65 >> ++ >> arch/powerpc/platforms/powernv/opal.c | 2 + >> 4 files changed, 69 insertions(+) >> create mode 100644 arch/powerpc/platforms/powernv/opal-hdat.c >> >> diff --git a/arch/powerpc/include/asm/opal.h >> b/arch/powerpc/include/asm/opal.h >> index 5c7db0f..b26944e 100644 >> --- a/arch/powerpc/include/asm/opal.h >> +++ b/arch/powerpc/include/asm/opal.h >> @@ -277,6 +277,7 @@ extern int opal_async_comp_init(void); >> extern int opal_sensor_init(void); >> extern int opal_hmi_handler_init(void); >> extern int opal_event_init(void); >> +extern void opal_hdat_sysfs_init(void); >> >> extern int opal_machine_check(struct pt_regs *regs); >> extern bool opal_mce_check_early_recovery(struct pt_regs *regs); >> diff --git a/arch/powerpc/platforms/powernv/Makefile >> b/arch/powerpc/platforms/powernv/Makefile >> index b5d98cb..9a0c9d6 100644 >> --- a/arch/powerpc/platforms/powernv/Makefile >> +++ b/arch/powerpc/platforms/powernv/Makefile >> @@ -3,6 +3,7 @@ obj-y += opal-rtc.o opal-nvram.o >> opal-lpc.o opal-flash.o >> obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o >> opal-sensor.o >> obj-y += opal-msglog.o 
opal-hmi.o opal-power.o >> opal-irqchip.o >> obj-y += opal-kmsg.o >> +obj-y += opal-hdat.o > > > Normally we keep putting new object files on the same line until it gets > long enough that we have to break it. This is very minor though :) > > >> >> obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o >> obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o >> diff --git a/arch/powerpc/platforms/powernv/opal-hdat.c >> b/arch/powerpc/platforms/powernv/opal-hdat.c >> new file mode 100644 >> index 000..3315dd3 >> --- /dev/null >> +++ b/arch/powerpc/platforms/powernv/opal-hdat.c >> @@ -0,0 +1,65 @@ >> +/* >> + * PowerNV OPAL HDAT interface >> + * >> + * Author: Matt Brown >> + * >> + * Copyright 2017 IBM Corp. >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU General Public License >> + * as published by the Free Software Foundation; either version >> + * 2 of the License, or (at your option) any later version. >> + */ >> + >> +#include >> +#include >> +#include >> +#include >> + >> +struct hdat_info { >> + char *base; >> + u64 size; >> +}; >> + >> +static struct hdat_info hdat_inf; > > > As Oliver pointed out, we could do with a better name than hdat_inf - it's > only one character away from the name of the struct type. Hmm, perhaps > "hdat_location", or maybe Oliver has a better suggestion. I'm not that bothered by it. Reviewed-by: Oliver O'Halloran > > > -- > Andrew Donnellan OzLabs, ADL Canberra > andrew.donnel...@au1.ibm.com IBM Australia Limited >
Re: [RFC] Remove memory from nodes for memtrace.
On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta wrote: > Some powerpc hardware features may want to gain access to a > chunk of undisturbed real memory. This update provides a means to unplug > said memory from the kernel with a set of sysfs calls. By writing an integer > containing the size of memory to be unplugged into > /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much > memory from the end of each available chip's memory space. In addition, the > means to read out the contents of the unplugged memory is also provided by > reading out the /sys/kernel/debug/powerpc/memtrace//dump file. > > Signed-off-by: Rashmica Gupta > --- > Written by Douglas Lehr . > Have tested and seems to work as I would expect. Only change I have made from > the original is to check that the value being written to the debugfs file is > not 0 (or obscenely large), as otherwise you get a nice kernel oops where the > kernel attempts to access data at 0xfffe0. > > Thoughts about doing this with hot unplug or other changes? 
> > arch/powerpc/mm/hash_native_64.c | 39 +++- > arch/powerpc/platforms/powernv/Makefile | 1 + > arch/powerpc/platforms/powernv/memtrace.c | 285 > ++ > 3 files changed, 321 insertions(+), 4 deletions(-) > create mode 100644 arch/powerpc/platforms/powernv/memtrace.c > > diff --git a/arch/powerpc/mm/hash_native_64.c > b/arch/powerpc/mm/hash_native_64.c > index cc33260..44cc6ce 100644 > --- a/arch/powerpc/mm/hash_native_64.c > +++ b/arch/powerpc/mm/hash_native_64.c > @@ -3,7 +3,7 @@ > * > * SMP scalability work: > *Copyright (C) 2001 Anton Blanchard , IBM > - * > + * > * This program is free software; you can redistribute it and/or > * modify it under the terms of the GNU General Public License > * as published by the Free Software Foundation; either version > @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte > *hptep) > while (1) { > if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) > break; > - while(test_bit(HPTE_LOCK_BIT, word)) > + while (test_bit(HPTE_LOCK_BIT, word)) > cpu_relax(); > } > } > @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long > hpte_group, unsigned long vpn, > } > > for (i = 0; i < HPTES_PER_GROUP; i++) { > - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { > + if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) { > /* retry with lock held */ > native_lock_hpte(hptep); > - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) > + if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) > break; > native_unlock_hpte(hptep); > } > @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long > newpp, unsigned long ea, > tlbie(vpn, psize, psize, ssize, 0); > } > > +/* > + * Remove a bolted kernel entry. Memory hotplug uses this. > + * > + * No need to lock here because we should be the only user. 
> + */ > +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) > +{ > + unsigned long vpn; > + unsigned long vsid; > + long slot; > + struct hash_pte *hptep; > + > + vsid = get_kernel_vsid(ea, ssize); > + vpn = hpt_vpn(ea, vsid, ssize); > + > + slot = native_hpte_find(vpn, psize, ssize); > + if (slot == -1) > + return -ENOENT; > + > + hptep = htab_address + slot; > + > + /* Invalidate the hpte */ > + hptep->v = 0; > + > + /* Invalidate the TLB */ > + tlbie(vpn, psize, psize, ssize, 0); > + return 0; > +} > + > + > static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, >int bpsize, int apsize, int ssize, int > local) > { > @@ -722,6 +752,7 @@ void __init hpte_init_native(void) > mmu_hash_ops.hpte_invalidate= native_hpte_invalidate; > mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; > mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; > + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; > mmu_hash_ops.hpte_insert= native_hpte_insert; > mmu_hash_ops.hpte_remove= native_hpte_remove; > mmu_hash_ops.hpte_clear_all = native_hpte_clear; > diff --git a/arch/powerpc/platforms/powernv/Makefile > b/arch/powerpc/platforms/powernv/Makefile > index b5d98cb..2026661 100644 > --- a/arch/powerpc/platforms/powernv/Makefile > +++ b/arch/powerpc/platforms/powernv/Makefile > @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH) += eeh-powernv.o > obj-$(CONFIG_PPC_SCOM) += opal-xscom.o > obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o > obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o > +obj-$(CONFI
Re: [PATCH] powerpc/powernv: add hdat attribute to sysfs
On Thu, Feb 23, 2017 at 1:29 PM, Matt Brown wrote: > From: Matt Brown > > The HDAT data area is consumed by skiboot and turned into a device-tree. > In some cases we would like to look directly at the HDAT, so this patch > adds a sysfs node to allow it to be viewed. This is not possible through > /dev/mem as it is reserved memory which is stopped by the /dev/mem filter. > > Signed-off-by: Matt Brown > --- > arch/powerpc/include/asm/opal.h | 1 + > arch/powerpc/platforms/powernv/opal-msglog.c | 49 > > arch/powerpc/platforms/powernv/opal.c| 2 ++ > 3 files changed, 52 insertions(+) > > diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h > index 5c7db0f..b26944e 100644 > --- a/arch/powerpc/include/asm/opal.h > +++ b/arch/powerpc/include/asm/opal.h > @@ -277,6 +277,7 @@ extern int opal_async_comp_init(void); > extern int opal_sensor_init(void); > extern int opal_hmi_handler_init(void); > extern int opal_event_init(void); > +extern void opal_hdat_sysfs_init(void); > > extern int opal_machine_check(struct pt_regs *regs); > extern bool opal_mce_check_early_recovery(struct pt_regs *regs); > diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c > b/arch/powerpc/platforms/powernv/opal-msglog.c > index 39d6ff9..a637055 100644 > --- a/arch/powerpc/platforms/powernv/opal-msglog.c > +++ b/arch/powerpc/platforms/powernv/opal-msglog.c > @@ -31,7 +31,13 @@ struct memcons { > __be32 in_cons; > }; > > +struct hdatInfo { > + char *base; > + u64 size; > +}; > + > static struct memcons *opal_memcons = NULL; > +static struct hdatInfo hdat_inf; I have a few 'o's to spare if you need one. > > ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) > { > @@ -136,3 +142,46 @@ void __init opal_msglog_sysfs_init(void) > if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0) > pr_warn("OPAL: sysfs file creation failed\n"); > } > + > + > + > +/* Read function for HDAT attribute in sysfs */ Bonus whitespace! 
> +static ssize_t hdat_read(struct file *file, struct kobject *kobj, > +struct bin_attribute *bin_attr, char *to, > +loff_t pos, size_t count) > +{ > + if (!hdat_inf.base) > + return -ENODEV; > + > + return memory_read_from_buffer(to, count, &pos, hdat_inf.base, > + hdat_inf.size); > +} Hmm... There's been some ideas floating around about removing Skiboot from the linear mapping and that would break this. However, that is something we should probably shouldn't worry about until it happens. > + > + > +/* HDAT attribute for sysfs */ > +static struct bin_attribute hdat_attr = { > + .attr = {.name = "hdat", .mode = 0444}, > + .read = hdat_read > +}; > + > +void __init opal_hdat_sysfs_init(void) > +{ > + u64 hdatAddr[2]; > + > + /* Check for the hdat-map prop in device-tree */ > + if (of_property_read_u64_array(opal_node, "hdat-map", hdatAddr, 2)) { > + pr_debug("OPAL: Property hdat-map not found.\n"); > + return; > + } > + > + /* Print out hdat-map values. [0]: base, [1]: size */ > + pr_debug("HDAT Base address: %#llx\n", hdatAddr[0]); > + pr_debug("HDAT Size: %#llx\n", hdatAddr[1]); > + > + hdat_inf.base = phys_to_virt(hdatAddr[0]); > + hdat_inf.size = hdatAddr[1]; > + > + if (sysfs_create_bin_file(opal_kobj, &hdat_attr) != 0) > + pr_debug("OPAL: sysfs file creation for HDAT failed"); > + > +} > diff --git a/arch/powerpc/platforms/powernv/opal.c > b/arch/powerpc/platforms/powernv/opal.c > index 2822935..cae3745 100644 > --- a/arch/powerpc/platforms/powernv/opal.c > +++ b/arch/powerpc/platforms/powernv/opal.c > @@ -740,6 +740,8 @@ static int __init opal_init(void) > opal_sys_param_init(); > /* Setup message log sysfs interface. */ > opal_msglog_sysfs_init(); > + /* Create hdat object under sys/firmware/opal */ > + opal_hdat_sysfs_init(); > } > > /* Initialize platform devices: IPMI backend, PRD & flash interface */ > -- > 2.9.3 > Quibbling aside, look ok. Reviewed-by: Oliver O'Halloran
Re: [PowerPC] 4.10.0 fails to build on BE config
On Tue, Feb 21, 2017 at 6:25 PM, abdul wrote: > Hi, > > Today's mainline build, breaks on Power6 and Power7 (all BE config) with > these build errors > > arch/powerpc/kernel/time.c: In function ‘running_clock’: > arch/powerpc/kernel/time.c:712:2: error: implicit declaration of function > ‘cputime_to_nsecs’ [-Werror=implicit-function-declaration] > return local_clock() - > cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]); > ^ > cc1: some warnings being treated as errors > make[1]: *** [arch/powerpc/kernel/time.o] Error 1 > > > Regard's > Abdul Haleem > IBM Linux Technology Center. Hi Abdul, Are there any extra patches in your tree? I briefly tried to reproduce this, but in my local tree this line: > return local_clock() - > cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]); Is at time.c:692 rather than time.c:712 Oliver
Re: [PATCH] powerpc: Use octal numbers for file permissions
It has been pointed out that this actually occurred in 2017. My apologies. On 17/01/2017 9:50 PM, "Oliver O'Halloran" wrote: > "It's possible I missed one, but I did genuinely review all of it" > > Cyril Bur, 2016 > In a hobart pub, specifically The Winston > > On 17/01/2017 8:53 PM, "Michael Ellerman" wrote: > >> Cyril Bur writes: >> >> > On Thu, 2017-01-12 at 14:54 +1100, Russell Currey wrote: >> >> Symbolic macros are unintuitive and hard to read, whereas octal >> constants >> >> are much easier to interpret. Replace macros for the basic permission >> >> flags (user/group/other read/write/execute) with numeric constants >> >> instead, across the whole powerpc tree. >> >> >> >> Introducing a significant number of changes across the tree for no >> runtime >> >> benefit isn't exactly desirable, but so long as these macros are still >> >> used in the tree people will keep sending patches that add them. Not >> only >> >> are they hard to parse at a glance, there are multiple ways of coming >> to >> >> the same value (as you can see with 0444 and 0644 in this patch) which >> >> hurts readability. >> >> >> >> Signed-off-by: Russell Currey >> > >> > Reviewed-by: Cyril Bur >> >> Did you really really review every single change? >> >> Because if you did then I don't have to, and that would be *great* :) >> >> cheers >> >
Re: [PATCH] powerpc: Use octal numbers for file permissions
"It's possible I missed one, but I did genuinely review all of it" Cyril Bur, 2016 In a hobart pub, specifically The Winston On 17/01/2017 8:53 PM, "Michael Ellerman" wrote: > Cyril Bur writes: > > > On Thu, 2017-01-12 at 14:54 +1100, Russell Currey wrote: > >> Symbolic macros are unintuitive and hard to read, whereas octal > constants > >> are much easier to interpret. Replace macros for the basic permission > >> flags (user/group/other read/write/execute) with numeric constants > >> instead, across the whole powerpc tree. > >> > >> Introducing a significant number of changes across the tree for no > runtime > >> benefit isn't exactly desirable, but so long as these macros are still > >> used in the tree people will keep sending patches that add them. Not > only > >> are they hard to parse at a glance, there are multiple ways of coming to > >> the same value (as you can see with 0444 and 0644 in this patch) which > >> hurts readability. > >> > >> Signed-off-by: Russell Currey > > > > Reviewed-by: Cyril Bur > > Did you really really review every single change? > > Because if you did then I don't have to, and that would be *great* :) > > cheers >
Re: [PATCH v5 2/5] powernv:stop: Uniformly rename power9 to arch300
On Fri, Jan 13, 2017 at 2:44 PM, Gautham R Shenoy wrote: > On Thu, Jan 12, 2017 at 03:17:33PM +0530, Balbir Singh wrote: >> On Tue, Jan 10, 2017 at 02:37:01PM +0530, Gautham R. Shenoy wrote: >> > From: "Gautham R. Shenoy" >> > >> > Balbir pointed out that in idle_book3s.S and powernv/idle.c some >> > functions and variables had power9 in their names while some others >> > had arch300. >> > >> >> I would prefer power9 to arch300 >> > > > I don't have a strong preference for arch300 vs power9, will change it > to power9 if that looks better. Personally I think we should be as descriptive as possible and use power_9_arch_300_the_bikeshed_is_red_dammit. Oliver
[RFC PATCH] powerpc/powernv: report error messages from opal
Recent versions of skiboot will raise an OPAL event (read: interrupt) when firmware writes an error message to its internal console. In conjunction they provide an OPAL call that the kernel can use to extract these messages from the OPAL log to allow them to be written into the kernel's log buffer where someone will (hopefully) look at them. For the companion skiboot patches see: https://lists.ozlabs.org/pipermail/skiboot/2016-December/005861.html Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/opal-api.h| 5 +++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/opal-msglog.c | 41 ++ arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0e2e57bcab50..cb9c0e6afb33 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -167,7 +167,8 @@ #define OPAL_INT_EOI 124 #define OPAL_INT_SET_MFRR 125 #define OPAL_PCI_TCE_KILL 126 -#define OPAL_LAST 126 +#define OPAL_SCRAPE_LOG128 +#define OPAL_LAST 128 /* Device tree flags */ @@ -288,6 +289,7 @@ enum OpalPendingState { OPAL_EVENT_PCI_ERROR = 0x200, OPAL_EVENT_DUMP_AVAIL = 0x400, OPAL_EVENT_MSG_PENDING = 0x800, + OPAL_EVENT_LOG_PENDING = 0x1000, }; enum OpalThreadStatus { @@ -406,6 +408,7 @@ enum opal_msg_type { OPAL_MSG_DPO= 5, OPAL_MSG_PRD= 6, OPAL_MSG_OCC= 7, + OPAL_MSG_LOG= 8, OPAL_MSG_TYPE_MAX, }; diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 5c7db0f1a708..2b3bd3219fb4 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -232,6 +232,7 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, int64_t opal_rm_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); +int64_t opal_scrape_log(int64_t *offset, char *buf, int64_t len, int64_t *lvl); /* Internal functions */ 
extern int early_init_dt_scan_opal(unsigned long node, const char *uname, diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index 39d6ff9e5630..78168f66fb24 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -15,6 +15,7 @@ #include #include #include +#include /* OPAL in-memory console. Defined in OPAL source at core/console.c */ struct memcons { @@ -102,8 +103,36 @@ static struct bin_attribute opal_msglog_attr = { .read = opal_msglog_read }; +static char *log_levels[] = { "Emergency", "Alert", "Critical", "Error", "Warning" }; +static int64_t offset = -1; + +static irqreturn_t opal_print_log(int irq, void *data) +{ + int64_t rc, log_lvl; + char buffer[320]; + + /* +* only print one message per invokation of the IRQ handler +*/ + + rc = opal_scrape_log(&offset, buffer, sizeof(buffer), &log_lvl); + + if (rc == OPAL_SUCCESS || rc == OPAL_PARTIAL) { + log_lvl = be64_to_cpu(log_lvl); + if (log_lvl > 4) + log_lvl = 4; + + printk_emit(0, log_lvl, NULL, 0, "OPAL %s: %s%s\r\n", + log_levels[log_lvl], buffer, + rc == OPAL_PARTIAL ? 
"" : ""); + } + + return IRQ_HANDLED; +} + void __init opal_msglog_init(void) { + int virq, rc = -1; u64 mcaddr; struct memcons *mc; @@ -123,6 +152,18 @@ void __init opal_msglog_init(void) return; } + virq = opal_event_request(ilog2(OPAL_EVENT_LOG_PENDING)); + if (virq) { + rc = request_irq(virq, opal_print_log, + IRQF_TRIGGER_HIGH, "opal memcons", NULL); + + if (rc) + irq_dispose_mapping(virq); + } + + if (!virq || rc) + pr_warn("Unable to register OPAL log event handler\n"); + opal_memcons = mc; } diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 3aa40f1b20f5..c59d7da3fd1a 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -312,3 +312,4 @@ OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL_REAL(opal_rm_int_set_mfrr, OPA
[PATCH] powerpc/time: clear LPCR.LD when unneeded
Currently the kernel will enable LD mode at boot when required. However, when using kexec the second kernel may not want to have the LD enabled. This patch ensures the second kernel will explicitly clear the LD flag when not required by the current kernel. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/time.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be9751f1cb2a..816700e8a475 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -925,18 +925,16 @@ static void register_decrementer_clockevent(int cpu) static void enable_large_decrementer(void) { - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - return; - - if (decrementer_max <= DECREMENTER_DEFAULT_MAX) - return; - /* * If we're running as the hypervisor we need to enable the LD manually * otherwise firmware should have done it for us. */ - if (cpu_has_feature(CPU_FTR_HVMODE)) + if (decrementer_max > DECREMENTER_DEFAULT_MAX + && cpu_has_feature(CPU_FTR_HVMODE) + && cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD); + else + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_LD); } static void __init set_decrementer_max(void) -- 2.7.4
Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels
On Thu, Nov 24, 2016 at 1:38 AM, Segher Boessenkool wrote: > On Thu, Nov 24, 2016 at 12:08:40AM +1100, Nicholas Piggin wrote: >> Question, are there any fundamental reasons we shouldn't use the ELFv2 >> ABI to build big endian kernels if the compiler supports it? > > No one uses ELFv2 for BE in production, and it isn't thoroughly tested > at all, not even regularly tested. "Not supported", as far as GCC is > concerned (or any of the distros AFAIK). Is this actually unsupported by gcc? The ppc64 musl libc port is ABI v2 only so they use it on BE too. Buildroot forces ABI v2 to be used for all of userspace when musl is selected as the libc for this reason so it's not completely unused in the wild. It's still pretty niche though...
[PATCH] powerpc/boot: fix the early OPAL console wrappers
When configured with CONFIG_PPC_EARLY_DEBUG_OPAL=y the kernel expects the OPAL entry and base addresses to be passed in r8 and r9 respectively. Currently the wrapper does not attempt to restore these values before entering the decompressed kernel which causes the kernel to branch into whatever happens to be in r9 when doing a write to the OPAL console in early boot. This patch adds a platform_ops hook that can be used to branch into the new kernel. The OPAL console driver patches this at runtime so that if the console is used it will be restored just prior to entering the kernel. Fixes: 656ad58ef19e Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/main.c | 8 ++-- arch/powerpc/boot/opal-calls.S | 13 + arch/powerpc/boot/opal.c | 11 +++ arch/powerpc/boot/ops.h| 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index d80161b633f4..60522d22a428 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -217,8 +217,12 @@ void start(void) console_ops.close(); kentry = (kernel_entry_t) vmlinux.addr; - if (ft_addr) - kentry(ft_addr, 0, NULL); + if (ft_addr) { + if(platform_ops.kentry) + platform_ops.kentry(ft_addr, vmlinux.addr); + else + kentry(ft_addr, 0, NULL); + } else kentry((unsigned long)initrd.addr, initrd.size, loader_info.promptr); diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S index ff2f1b97bc53..2a99fc9a3ccf 100644 --- a/arch/powerpc/boot/opal-calls.S +++ b/arch/powerpc/boot/opal-calls.S @@ -12,6 +12,19 @@ .text + .globl opal_kentry +opal_kentry: + /* r3 is the fdt ptr */ + mtctr r4 + li r4, 0 + li r5, 0 + li r6, 0 + li r7, 0 + ld r11,opal@got(r2) + ld r8,0(r11) + ld r9,8(r11) + bctr + #define OPAL_CALL(name, token) \ .globl name;\ name: \ diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c index 1f37e1c1d6d8..d7b4fd47eb44 100644 --- a/arch/powerpc/boot/opal.c +++ b/arch/powerpc/boot/opal.c @@ -23,14 +23,25 @@ struct opal { static 
u32 opal_con_id; +/* see opal-wrappers.S */ int64_t opal_console_write(int64_t term_number, u64 *length, const u8 *buffer); int64_t opal_console_read(int64_t term_number, uint64_t *length, u8 *buffer); int64_t opal_console_write_buffer_space(uint64_t term_number, uint64_t *length); int64_t opal_console_flush(uint64_t term_number); int64_t opal_poll_events(uint64_t *outstanding_event_mask); +void opal_kentry(unsigned long fdt_addr, void *vmlinux_addr); + static int opal_con_open(void) { + /* +* When OPAL loads the boot kernel it stashes the OPAL base and entry +* address in r8 and r9 so the kernel can use the OPAL console +* before unflattening the devicetree. While executing the wrapper will +* probably trash r8 and r9 so this kentry hook restores them before +* entering the decompressed kernel. +*/ + platform_ops.kentry = opal_kentry; return 0; } diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index e19b64ef977a..deeae6f6ba9c 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -30,6 +30,7 @@ struct platform_ops { void * (*realloc)(void *ptr, unsigned long size); void(*exit)(void); void * (*vmlinux_alloc)(unsigned long size); + void(*kentry)(unsigned long fdt_addr, void *vmlinux_addr); }; extern struct platform_ops platform_ops; -- 2.5.5
Re: [PATCH v2 2/3] cpuidle:powernv: Add helper function to populate powernv idle states.
s[i] & OPAL_PM_STOP_INST_FAST) && > !(flags[i] & OPAL_PM_TIMEBASE_STOP)) { > - strncpy(powernv_states[nr_idle_states].name, > - names[i], CPUIDLE_NAME_LEN); > - strncpy(powernv_states[nr_idle_states].desc, > - names[i], CPUIDLE_NAME_LEN); > - powernv_states[nr_idle_states].flags = 0; > - > - powernv_states[nr_idle_states].enter = stop_loop; > - stop_psscr_table[nr_idle_states] = psscr_val[i]; > + add_powernv_state(nr_idle_states, names[i], > + CPUIDLE_FLAG_NONE, stop_loop, > + target_residency, exit_latency, > + psscr_val[i]); > } > > /* > @@ -274,32 +300,20 @@ static int powernv_add_idle_states(void) > #ifdef CONFIG_TICK_ONESHOT > if (flags[i] & OPAL_PM_SLEEP_ENABLED || > flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { > + target_residency = 30; Same comment as above. > /* Add FASTSLEEP state */ > - strcpy(powernv_states[nr_idle_states].name, > "FastSleep"); > - strcpy(powernv_states[nr_idle_states].desc, > "FastSleep"); > - powernv_states[nr_idle_states].flags = > CPUIDLE_FLAG_TIMER_STOP; > - powernv_states[nr_idle_states].target_residency = > 30; > - powernv_states[nr_idle_states].enter = fastsleep_loop; > + add_powernv_state(nr_idle_states, "FastSleep", > + CPUIDLE_FLAG_TIMER_STOP, > + fastsleep_loop, > + target_residency, exit_latency, 0); > } else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) && > (flags[i] & OPAL_PM_TIMEBASE_STOP)) { > - strncpy(powernv_states[nr_idle_states].name, > - names[i], CPUIDLE_NAME_LEN); > - strncpy(powernv_states[nr_idle_states].desc, > - names[i], CPUIDLE_NAME_LEN); > - > - powernv_states[nr_idle_states].flags = > CPUIDLE_FLAG_TIMER_STOP; > - powernv_states[nr_idle_states].enter = stop_loop; > - stop_psscr_table[nr_idle_states] = psscr_val[i]; > + add_powernv_state(nr_idle_states, names[i], > + CPUIDLE_FLAG_TIMER_STOP, stop_loop, > + target_residency, exit_latency, > + psscr_val[i]); > } > #endif > - powernv_states[nr_idle_states].exit_latency = > - ((unsigned int)latency_ns[i]) / 1000; > - > - if (!rc) { > - 
powernv_states[nr_idle_states].target_residency = > - ((unsigned int)residency_ns[i]) / 1000; > - } > - > nr_idle_states++; > } > out: > diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h > index bb31373..c4e10f8 100644 > --- a/include/linux/cpuidle.h > +++ b/include/linux/cpuidle.h > @@ -62,6 +62,7 @@ struct cpuidle_state { > }; > > /* Idle State Flags */ > +#define CPUIDLE_FLAG_NONE (0x00) > #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ > #define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ > > -- > 1.9.4 > Looks good otherwise. Reviewed-by: Oliver O'Halloran
[PATCH] powerpc/powernv: de-duplicate OPAL call wrappers
Currently the code to perform an OPAL call is duplicated between the normal path and path taken when tracepoints are enabled. There's no real need for this and combining them makes opal_tracepoint_entry considerably easier to understand. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/opal-wrappers.S | 44 ++ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 44d2d842cee7..3ebe0db7ffeb 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -50,18 +50,14 @@ END_FTR_SECTION(0, 1); \ #define OPAL_BRANCH(LABEL) #endif -/* TODO: +/* + * DO_OPAL_CALL assumes: * - * - Trace irqs in/off (needs saving/restoring all args, argh...) - * - Get r11 feed up by Dave so I can have better register usage + * r0 = OPAL call token + * LR has been saved on the stack */ -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name);\ - mflrr0; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - OPAL_BRANCH(opal_tracepoint_entry) \ +#define DO_OPAL_CALL() \ mfcrr12;\ stw r12,8(r1); \ li r11,0; \ @@ -81,6 +77,14 @@ END_FTR_SECTION(0, 1); \ mtspr SPRN_HSRR0,r12; \ hrfid +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name);\ + mflrr0; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + DO_OPAL_CALL() + opal_return: /* * Fixup endian on OPAL return... 
we should be able to simplify @@ -122,26 +126,12 @@ opal_tracepoint_entry: ld r8,STK_REG(R29)(r1) ld r9,STK_REG(R30)(r1) ld r10,STK_REG(R31)(r1) + + /* return from the opal call via tracepoint_return */ LOAD_REG_ADDR(r11,opal_tracepoint_return) - mfcrr12 std r11,16(r1) - stw r12,8(r1) - li r11,0 - mfmsr r12 - ori r11,r11,MSR_EE - std r12,PACASAVEDMSR(r13) - andcr12,r12,r11 - mtmsrd r12,1 - LOAD_REG_ADDR(r11,opal_return) - mtlrr11 - li r11,MSR_DR|MSR_IR|MSR_LE - andcr12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid + + DO_OPAL_CALL() opal_tracepoint_return: std r3,STK_REG(R31)(r1) -- 2.5.5
Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot
On Tue, Oct 11, 2016 at 7:06 AM, Heiner Kallweit wrote: >> IMHO in case of using cuboot no CONFIG_KERNEL_ config option >> should be set and Makefile + code in arch/powerpc/boot should be able >> to deal with this situation: >> - don't copy and build the decompression stuff >> - use an alternative version of prep_kernel() in main.c which doesn't >> attempt to decompress the kernel image >> >> This should be a cleaner solution than probing the kernel image whether >> it's compressed or not. >> > > This would be the patch implementing the idea. Advantage is that all > the unnecessary decompression code isn't built. Works fine for me. I don't think this approach is viable. The wrapper code is shared among the various output image formats some of which *will* contain a compressed kernel image so we can't simply remove the decompressor from the wrapper. A random example I found in the makefile was CONFIG_BAMBOO: > image-$(CONFIG_BAMBOO) += treeImage.bamboo cuImage.bamboo When building for this platform Kbuild will produce treeboot and a cuboot image. Unlike uboot, Treeboot doesn't do any decompression so the wrapper needs to decompress the kernel itself. The probing solution more or less matches the old behaviour (which we know works) so I think we should just stick with that. - Oliver
Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot
On Mon, Oct 10, 2016 at 3:41 PM, Michael Ellerman wrote: > Heiner Kallweit writes: > >> Am 07.10.2016 um 21:26 schrieb Heiner Kallweit: >>> Am 07.10.2016 um 07:51 schrieb Oliver O'Halloran: >>>> Hi, Heiner >>>> >>>> Could you send me a copy of the kernel .config (or which defconfig) >>>> that you're using, the name of the HW platform that you're using and >>>> if possible the kernel image itself? >>>> >>>> Thanks, >>>> Oliver >>>> >>> Thanks for the quick reply. Attached are .config and cuImage. >>> HW is a TP-Link TL-WDR4900 WiFi router (P1014-based) running OpenWRT. >>> >> After further checking I think I found the issue. The old gunzip code >> handled uncompressed data transparently whilst the new one bails out >> if it doesn't find a proper gzip header. >> And in my case the actual kernel image is uncompressed. >> With the following patch the system boots fine again (at least for me). > > Thanks for testing and tracking it down. Yeah thanks for that. I was putting off looking at it until Monday :) > > I wonder why the actual image is uncompressed? Or alternately why do we > tell uboot the image is compressed when it's not? The uboot payload (wrapper, kernel, initrd) as a whole is compressed as a single blob. Modern uboot can just decompress the payload and jump straight into the kernel and I'd assumed that all uboot platforms did this. The problem is that the compatible uboot (cuboot) images do use the wrapper and the vmlinux baked into the wrapper is uncompressed. Oliver
Re: [PATCH] Fix "ibm,processor-radix-AP-encodings"
On Wed, Sep 28, 2016 at 12:43 PM, Aneesh Kumar K.V wrote: > Balbir Singh writes: > >> The top 3 bits of the lower order byte should contain the >> AP encoding, we assume the top 3 bits of the MSB. Balbir, could you reword this so it says "Currently we wrongly assume " or similar. The current commit message made me think you were changing it to look at the top 3 bits of the MSB rather than changing it look at the LSB. > Are you sure, Power architecture documents always confuse about MSB vs > lowe order bytes. ? PAPR seems to be pretty consistent about "low order" meaning "least significant." Additionally the PAPR that describes ibm,processor-radix-AP-encodings says that it is formatted this way so it can be used when constructing the register argument to tlbie. The modes of tlbie that use the AP field place it in bits 56:59 so I think Balbir's fix is correct. Reviewed-By: Oliver O'Halloran
Re: Commit 1b7898ee276b "powerpc/boot: Use the pre-boot decompression API" breaks boot
Hi, Heiner Could you send me a copy of the kernel .config (or which defconfig) that you're using, the name of the HW platform that you're using and if possible the kernel image itself? Thanks, Oliver
[PATCH 6/6] powerpc/boot: Add support for XZ compression
This patch adds an option to use XZ compression for the kernel image. Currently this is only enabled for PPC64 targets since the bulk of the 32bit platforms produce uboot images which do not use the wrapper. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 3 +++ arch/powerpc/boot/decompress.c | 5 + arch/powerpc/boot/stdbool.h| 15 + arch/powerpc/boot/stdint.h | 13 arch/powerpc/boot/types.h | 14 arch/powerpc/boot/xz_config.h | 39 ++ arch/powerpc/platforms/Kconfig.cputype | 1 + 7 files changed, 90 insertions(+) create mode 100644 arch/powerpc/boot/stdbool.h create mode 100644 arch/powerpc/boot/stdint.h create mode 100644 arch/powerpc/boot/xz_config.h diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 9fb451d0586e..eae2dc8bc218 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -20,6 +20,7 @@ all: $(obj)/zImage compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP +compress-$(CONFIG_KERNEL_XZ) := CONFIG_KERNEL_XZ BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ @@ -226,6 +227,7 @@ endif endif compressor-$(CONFIG_KERNEL_GZIP) := gz +compressor-$(CONFIG_KERNEL_XZ) := xz # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ @@ -433,6 +435,7 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ # clean up files cached by wrapper clean-kernel-base := vmlinux.strip vmlinux.bin clean-kernel := $(addsuffix .gz,$(clean-kernel-base)) +clean-kernel += $(addsuffix .xz,$(clean-kernel-base)) # If not absolute clean-files are relative to $(obj). 
clean-files += $(addprefix $(objtree)/, $(clean-kernel)) diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c index 60fc6fb26867..8f32ea4289af 100644 --- a/arch/powerpc/boot/decompress.c +++ b/arch/powerpc/boot/decompress.c @@ -37,6 +37,11 @@ # include "decompress_inflate.c" #endif +#ifdef CONFIG_KERNEL_XZ +# include "xz_config.h" +# include "../../../lib/decompress_unxz.c" +#endif + /* globals for tracking the state of the decompression */ static unsigned long decompressed_bytes; static unsigned long limit; diff --git a/arch/powerpc/boot/stdbool.h b/arch/powerpc/boot/stdbool.h new file mode 100644 index ..2ebcfa53b4c7 --- /dev/null +++ b/arch/powerpc/boot/stdbool.h @@ -0,0 +1,15 @@ +/* + * Copyright (C) IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This file is only necessary because some of the pre-boot decompressors + * expect stdbool.h to be available. + * + */ + +#include "types.h" + diff --git a/arch/powerpc/boot/stdint.h b/arch/powerpc/boot/stdint.h new file mode 100644 index ..c1c853be7490 --- /dev/null +++ b/arch/powerpc/boot/stdint.h @@ -0,0 +1,13 @@ +/* + * Copyright (C) IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This file is only necessary because some of the pre-boot decompressors + * expect stdint.h to be available. 
+ */ + +#include "types.h" diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h index 85565a89bcc2..af6b66b842c4 100644 --- a/arch/powerpc/boot/types.h +++ b/arch/powerpc/boot/types.h @@ -1,6 +1,8 @@ #ifndef _TYPES_H_ #define _TYPES_H_ +#include + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) typedef unsigned char u8; @@ -34,4 +36,16 @@ typedef s64 int64_t; (void) (&_x == &_y);\ _x > _y ? _x : _y; }) +#define min_t(type, a, b) min(((type) a), ((type) b)) +#define max_t(type, a, b) max(((type) a), ((type) b)) + +typedef int bool; + +#ifndef true +#define true 1 +#endif + +#ifndef false +#define false 0 +#endif #endif /* _TYPES_H_ */ diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h new file mode 100644 index ..5c6afdbca642 --- /dev/null +++ b/arch/powerpc/boot/xz_config.h @@ -0,0 +1,39 @@ +#ifndef __XZ_CONFIG_H__ +#define __XZ_CONFIG_H__ + +/* + * most of this is copied from lib/xz/xz_private.h, we can't use their defines + * since the boot wrapper is not built in the same environment as the rest of + * the kernel. + */ + +#include "types.h" +#include "swab.h" + +static inline uint32_t swab32p(void *p) +{ + uint3
[PATCH 4/6] powerpc/boot: remove legacy gzip wrapper
This code is no longer used and can be removed. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/cuboot-c2k.c | 1 - arch/powerpc/boot/gunzip_util.c | 204 arch/powerpc/boot/gunzip_util.h | 45 - 3 files changed, 250 deletions(-) delete mode 100644 arch/powerpc/boot/gunzip_util.c delete mode 100644 arch/powerpc/boot/gunzip_util.h diff --git a/arch/powerpc/boot/cuboot-c2k.c b/arch/powerpc/boot/cuboot-c2k.c index e43594950ba3..9309c51f1d65 100644 --- a/arch/powerpc/boot/cuboot-c2k.c +++ b/arch/powerpc/boot/cuboot-c2k.c @@ -18,7 +18,6 @@ #include "io.h" #include "ops.h" #include "elf.h" -#include "gunzip_util.h" #include "mv64x60.h" #include "cuboot.h" #include "ppcboot.h" diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c deleted file mode 100644 index 9dc52501de83.. --- a/arch/powerpc/boot/gunzip_util.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright 2007 David Gibson, IBM Corporation. - * Based on earlier work, Copyright (C) Paul Mackerras 1997. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include "string.h" -#include "stdio.h" -#include "ops.h" -#include "gunzip_util.h" - -#define HEAD_CRC 2 -#define EXTRA_FIELD4 -#define ORIG_NAME 8 -#define COMMENT0x10 -#define RESERVED 0xe0 - -/** - * gunzip_start - prepare to decompress gzip data - * @state: decompressor state structure to be initialized - * @src: buffer containing gzip compressed or uncompressed data - * @srclen:size in bytes of the buffer at src - * - * If the buffer at @src contains a gzip header, this function - * initializes zlib to decompress the data, storing the decompression - * state in @state. The other functions in this file can then be used - * to decompress data from the gzipped stream. 
- * - * If the buffer at @src does not contain a gzip header, it is assumed - * to contain uncompressed data. The buffer information is recorded - * in @state and the other functions in this file will simply copy - * data from the uncompressed data stream at @src. - * - * Any errors, such as bad compressed data, cause an error to be - * printed an the platform's exit() function to be called. - */ -void gunzip_start(struct gunzip_state *state, void *src, int srclen) -{ - char *hdr = src; - int hdrlen = 0; - - memset(state, 0, sizeof(*state)); - - /* Check for gzip magic number */ - if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) { - /* gzip data, initialize zlib parameters */ - int r, flags; - - state->s.workspace = state->scratch; - if (zlib_inflate_workspacesize() > sizeof(state->scratch)) - fatal("insufficient scratch space for gunzip\n\r"); - - /* skip header */ - hdrlen = 10; - flags = hdr[3]; - if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0) - fatal("bad gzipped data\n\r"); - if ((flags & EXTRA_FIELD) != 0) - hdrlen = 12 + hdr[10] + (hdr[11] << 8); - if ((flags & ORIG_NAME) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & COMMENT) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & HEAD_CRC) != 0) - hdrlen += 2; - if (hdrlen >= srclen) - fatal("gunzip_start: ran out of data in header\n\r"); - - r = zlib_inflateInit2(&state->s, -MAX_WBITS); - if (r != Z_OK) - fatal("inflateInit2 returned %d\n\r", r); - } - - state->s.total_in = hdrlen; - state->s.next_in = src + hdrlen; - state->s.avail_in = srclen - hdrlen; -} - -/** - * gunzip_partial - extract bytes from a gzip data stream - * @state: gzip state structure previously initialized by gunzip_start() - * @dst: buffer to store extracted data - * @dstlen:maximum number of bytes to extract - * - * This function extracts at most @dstlen bytes from the data stream - * previously associated with @state by gunzip_start(), decompressing - * if necessary. 
Exactly @dstlen bytes are extracted unless the data - * stream doesn't contain enough bytes, in which case the entire - * remainder of the stream is decompressed. - * - * Returns the actual number of bytes extracted. If any erro
[PATCH 5/6] powerpc/boot: add xz support to the wrapper script
This modifies the script so that the -Z option takes an argument to specify the compression type. It can either be 'gz', 'xz' or 'none'. The legacy --no-gzip and -z options are still supported and will set the compression to none and gzip respectively, but they are not documented. Only xz -6 is used for compression rather than xz -9. Using compression levels higher than 6 requires the decompressor to build a large (64MB) dictionary when decompressing and some environments cannot satisfy large allocations (e.g. POWER 6 LPAR partition firmware). Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 7 -- arch/powerpc/boot/wrapper | 61 ++ 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 861348c72519..9fb451d0586e 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -225,10 +225,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)" endif endif +compressor-$(CONFIG_KERNEL_GZIP) := gz + # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ - cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \ - $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux + cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \ + $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \ + vmlinux image-$(CONFIG_PPC_PSERIES)+= zImage.pseries image-$(CONFIG_PPC_POWERNV)+= zImage.pseries diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 6681ec3625c9..6feacfd87588 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -20,6 +20,8 @@ # -D dir specify directory containing data files used by script # (default ./arch/powerpc/boot) # -W dir specify working directory for temporary files (default .) 
+# -z use gzip (legacy) +# -Z zsuffixcompression to use (gz, xz or none) # Stop execution if any command fails set -e @@ -38,7 +40,7 @@ dtb= dts= cacheit= binary= -gzip=.gz +compression=.gz pie= format= @@ -59,7 +61,8 @@ tmpdir=. usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 -echo ' [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2 +echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 +echo ' [--no-compression] [vmlinux]' >&2 exit 1 } @@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do [ "$#" -gt 0 ] || usage tmpdir="$1" ;; +-z) + compression=.gz + ;; +-Z) + shift + [ "$#" -gt 0 ] || usage +[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage + + compression=".$1" + +if [ $compression = ".none" ]; then +compression= +fi + ;; --no-gzip) -gzip= +# a "feature" of the the wrapper script is that it can be used outside +# the kernel tree. So keeping this around for backwards compatibility. +compression= ;; -?) usage @@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do shift done + if [ -n "$dts" ]; then if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then dts="$object/dts/$dts" @@ -212,7 +232,7 @@ miboot|uboot*) ;; cuboot*) binary=y -gzip= +compression= case "$platform" in *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc) platformo=$object/cuboot-8xx.o @@ -243,7 +263,7 @@ cuboot*) ps3) platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o" lds=$object/zImage.ps3.lds -gzip= +compression= ext=bin objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data" ksection=.kernel:vmlinux.bin @@ -310,27 +330,37 @@ mvme7100) esac vmz="$tmpdir/`basename \"$kernel\"`.$ext" -if [ -z "$cacheit" -o ! 
-f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then -${CROSS}objcopy $objflags "$kernel" "$vmz.$$" -strip_size=$(stat -c %s $vmz.$$) +# Calculate the vmlinux.strip size +${CROSS}objcopy $objflags "$kernel" "$vmz.$$" +strip_size=$(stat -c %s $vmz.$$) -if [ -n "$gzip" ]; then +if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot "$kernel" ]; then +# recompress the image if we need to +case $compression in +
[PATCH 3/6] powerpc/boot: use the preboot decompression API
Currently the powerpc boot wrapper has its own wrapper around zlib to handle decompressing gzipped kernels. The kernel decompressor library functions now provide a generic interface that can be used in the pre-boot environment. This allows boot wrappers to easily support different compression algorithms. This patch converts the wrapper to use this new API, but does not add support for using new algorithms. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 34 +++--- arch/powerpc/boot/decompress.c | 142 + arch/powerpc/boot/main.c | 35 +- arch/powerpc/boot/ops.h| 3 + 4 files changed, 189 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/boot/decompress.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index bede555d78cf..861348c72519 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -63,13 +63,28 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 -# the kernel's version of zlib pulls in a lot of other kernel headers -# which we don't provide inside the wrapper. +# The pre-boot decompressors pull in a lot of kernel headers and other source +# files. This creates a bit of a dependency headache since we need to copy +# these files into the build dir, fix up any includes and ensure that dependent +# files are copied in the right order. + +# these need to be seperate variables because they are copied out of different +# directories in the kernel tree. Sure you COULd merge them, but it's a +# cure-is-worse-than-disease situation. 
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ +$(addprefix $(obj)/, decompress.o): \ + $(addprefix $(obj)/,$(zlib-decomp-y)) + +$(addprefix $(obj)/, $(zlib-decomp-y)): \ + $(addprefix $(obj)/,$(zliblinuxheader-y)) \ + $(addprefix $(obj)/,$(zlibheader-y)) \ + $(addprefix $(obj)/,$(zlib-y)) + +$(addprefix $(obj)/,$(zlib-y)): \ $(addprefix $(obj)/,$(zliblinuxheader-y)) \ $(addprefix $(obj)/,$(zlibheader-y)) @@ -79,10 +94,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) -src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ +src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ + elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -143,6 +158,9 @@ $(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call cmd,copy_kern_src) +$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/% + $(call cmd,copy_kern_src) + quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ @@ -160,7 +178,7 @@ $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S $(Q)cp $< $@ clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ - $(libfdt) $(libfdtheader) \ + $(zlib-decomp-) $(libfdt) $(libfdtheader) \ empty.c zImage.coff.lds zImage.ps3.lds 
zImage.lds quiet_cmd_bootcc = BOOTCC $@ @@ -410,8 +428,8 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage.maple simpleImage.* otheros.bld *.dtb # clean up files cached by wrapper -clean-kernel := vmlinux.strip vmlinux.bin -clean-kernel += $(addsuffix .gz,$(clean-kernel)) +clean-kernel-base := vmlinux.strip vmlinux.bin +clean-kernel := $(addsuffix .gz,$(clean-kernel-base)) # If not absolute clean-files are relative to $(obj). clean-files += $(addprefix $(objtree)/, $(clean-kernel)) diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c new file mode 100644 index ..60fc6fb26867 --- /dev/null +++ b/arch/powerpc/boot/decompress.c @@ -0,0 +1,142 @@ +/* + * Wrapper around the kernel's pre-boot decompression library. + * + * Copyright (C) IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software F
[PATCH 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP
Most architectures allow the compression algorithm used to produce the vmlinuz image to be selected as a kernel config option. In preparation for supporting algorithms other than gzip in the powerpc boot wrapper the makefile needs to be modified to use these config options. Signed-off-by: Oliver O'Halloran --- arch/powerpc/Kconfig | 1 + arch/powerpc/boot/Makefile | 30 ++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5c295830e8c7..59e53f4552ae 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -161,6 +161,7 @@ config PPC select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_KERNEL_GZIP config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 7d6768253caa..bede555d78cf 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -19,10 +19,14 @@ all: $(obj)/zImage +compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP + BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ --isystem $(shell $(CROSS32CC) -print-file-name=include) +-isystem $(shell $(CROSS32CC) -print-file-name=include) \ +-D$(compress-y) + ifdef CONFIG_PPC64_BOOT_WRAPPER BOOTCFLAGS += -m64 endif @@ -59,13 +63,15 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 +# the kernel's version of zlib pulls in a lot of other kernel headers +# which we don't provide inside the wrapper. 
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c +zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h +zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -zlib := inffast.c inflate.c inftrees.c -zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h -zliblinuxheader := zlib.h zconf.h zutil.h - -$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ - $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) +$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ + $(addprefix $(obj)/,$(zliblinuxheader-y)) \ + $(addprefix $(obj)/,$(zlibheader-y)) libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h @@ -76,7 +82,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ + gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -128,13 +134,13 @@ obj-plat: $(libfdt) quiet_cmd_copy_kern_src = COPY$@ cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ -$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% +$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ 
@@ -153,7 +159,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S $(Q)cp $< $@ -clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \ +clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ $(libfdt) $(libfdtheader) \ empty.c zImage.coff.lds zImage.ps3.lds zImage.lds -- 2.5.5
[PATCH 1/6] powerpc/boot: add sed script
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain rather than the toolchain used for the rest of the kernel. The main problem with this is that the wrapper does not have access to the kernel headers (without a lot of gross hacks). To get around this the required headers are copied into the build directory via several sed scripts which rewrite problematic includes. This patch moves these fixups out of the makefile into a separate .sed script file to clean up makefile slightly. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 16 +--- arch/powerpc/boot/fixup-headers.sed | 12 2 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 arch/powerpc/boot/fixup-headers.sed diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index df0fd406aed1..7d6768253caa 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-wlib obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat obj-plat: $(libfdt) -quiet_cmd_copy_zlib = COPY$@ - cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@ - -quiet_cmd_copy_zlibheader = COPY$@ - cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@ -# stddef.h for NULL -quiet_cmd_copy_zliblinuxheader = COPY$@ - cmd_copy_zliblinuxheader = sed "s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@" $< > $@ +quiet_cmd_copy_kern_src = COPY$@ + cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlib) + $(call cmd,copy_kern_src) $(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlibheader) + $(call cmd,copy_kern_src) $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% - $(call cmd,copy_zliblinuxheader) + $(call cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ diff --git 
a/arch/powerpc/boot/fixup-headers.sed b/arch/powerpc/boot/fixup-headers.sed new file mode 100644 index ..96362428eb37 --- /dev/null +++ b/arch/powerpc/boot/fixup-headers.sed @@ -0,0 +1,12 @@ +# Copyright 2016 IBM Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 or later as +# published by the Free Software Foundation. + +s@#include @@; +s@\"zlib_inflate/\([^\"]*\).*@"\1"@; +s@@@; + +s@__used@@; +s@]*\).*@"\1"@; -- 2.5.5
[v3] XZ compressed zImage support
This series adds support for using XZ compression in addition to gzip in the kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors since it seems that some embedded platforms rely on uBoot (or similar) to decompress the image rather than having the kernel decompress itself. Enabling it for other platforms should be fairly straightforward though. Supporting other compression algorithms (like ARM and x86 do) is possible, but painful. Each algorithm includes some kernel headers even when the #defines that are supposed to make them usable in a pre-boot environment are set. Including kernel headers is an issue because on powerpc the boot wrapper is compiled with a different toolchain and possibly for a different target for backwards compatibility reasons*. This makes it difficult to include kernel headers since the include paths, etc are not setup for BOOTCC. This can be worked around by rewriting parts of each decompressor with sed scripts, but the rewriting required is specific to each decompressor. -oliver *powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64 bit big endian kernel has a 32bit wrapper to work around this. On 64bit little endian we don't have this legacy problem so the wrapper is also 64bit little endian, but the toolchain issues are still there. --- Changes from v1: fixed some missing dependencies in the Makefile that were causing random build breaks. Fixed "make clean" so that it would remove the files copied into arch/powerpc/boot/ when the wrapper was built. previously this series renamed "zlibheader" to "zlibheaders". There were consequences. Changes from v2: Adding missing stdint.h and stdbool.h Reduced XZ compression level from -9 to -6. Using compression levels above -6 requires the decompressor to construct a 64MB dictionary. The firmware on some platforms cannot satisfy large allocations (even when the memory is physically present) causing decompression failures. 
Luckily using the lower compression level doesn't have much of a penalty. ---
[PATCH v2 6/6] powerpc/boot: Add support for XZ compression
This patch adds an option to use XZ compression for the kernel image. Currently this is only enabled for PPC64 targets since the bulk of the 32bit platforms produce uboot images which do not use the wrapper. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 3 +++ arch/powerpc/boot/decompress.c | 5 + arch/powerpc/boot/types.h | 10 + arch/powerpc/boot/xz_config.h | 39 ++ arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 58 insertions(+) create mode 100644 arch/powerpc/boot/xz_config.h diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 9fb451d0586e..eae2dc8bc218 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -20,6 +20,7 @@ all: $(obj)/zImage compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP +compress-$(CONFIG_KERNEL_XZ) := CONFIG_KERNEL_XZ BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ @@ -226,6 +227,7 @@ endif endif compressor-$(CONFIG_KERNEL_GZIP) := gz +compressor-$(CONFIG_KERNEL_XZ) := xz # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ @@ -433,6 +435,7 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ # clean up files cached by wrapper clean-kernel-base := vmlinux.strip vmlinux.bin clean-kernel := $(addsuffix .gz,$(clean-kernel-base)) +clean-kernel += $(addsuffix .xz,$(clean-kernel-base)) # If not absolute clean-files are relative to $(obj). 
clean-files += $(addprefix $(objtree)/, $(clean-kernel)) diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c index 60fc6fb26867..8f32ea4289af 100644 --- a/arch/powerpc/boot/decompress.c +++ b/arch/powerpc/boot/decompress.c @@ -37,6 +37,11 @@ # include "decompress_inflate.c" #endif +#ifdef CONFIG_KERNEL_XZ +# include "xz_config.h" +# include "../../../lib/decompress_unxz.c" +#endif + /* globals for tracking the state of the decompression */ static unsigned long decompressed_bytes; static unsigned long limit; diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h index 85565a89bcc2..0362a262a299 100644 --- a/arch/powerpc/boot/types.h +++ b/arch/powerpc/boot/types.h @@ -34,4 +34,14 @@ typedef s64 int64_t; (void) (&_x == &_y);\ _x > _y ? _x : _y; }) +#define min_t(type, a, b) min(((type) a), ((type) b)) +#define max_t(type, a, b) max(((type) a), ((type) b)) + +#ifndef true +#define true 1 +#endif + +#ifndef false +#define false 0 +#endif #endif /* _TYPES_H_ */ diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h new file mode 100644 index ..5c6afdbca642 --- /dev/null +++ b/arch/powerpc/boot/xz_config.h @@ -0,0 +1,39 @@ +#ifndef __XZ_CONFIG_H__ +#define __XZ_CONFIG_H__ + +/* + * most of this is copied from lib/xz/xz_private.h, we can't use their defines + * since the boot wrapper is not built in the same environment as the rest of + * the kernel. 
+ */ + +#include "types.h" +#include "swab.h" + +static inline uint32_t swab32p(void *p) +{ + uint32_t *q = p; + + return swab32(*q); +} + +#ifdef __LITTLE_ENDIAN__ +#define get_le32(p) (*((uint32_t *) (p))) +#else +#define get_le32(p) swab32p(p) +#endif + +#define memeq(a, b, size) (memcmp(a, b, size) == 0) +#define memzero(buf, size) memset(buf, 0, size) + +/* prevent the inclusion of the xz-preboot MM headers */ +#define DECOMPR_MM_H +#define memmove memmove +#define XZ_EXTERN static + +/* xz.h needs to be included directly since we need enum xz_mode */ +#include "../../../include/linux/xz.h" + +#undef XZ_EXTERN + +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f32edec13fd1..d5da55b01027 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -2,6 +2,7 @@ config PPC64 bool "64-bit kernel" default n select ZLIB_DEFLATE + select HAVE_KERNEL_XZ help This option selects whether a 32-bit or a 64-bit kernel will be built. -- 2.5.5
[PATCH v2 5/6] powerpc/boot: add xz support to the wrapper script
This modifies the script so that the -Z option takes an argument to specify the compression type. It can either be 'gz', 'xz' or 'none'. The legacy --no-gzip and -z options are still supported and will set the compression to none and gzip respectively, but they are not documented. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 7 -- arch/powerpc/boot/wrapper | 61 ++ 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 861348c72519..9fb451d0586e 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -225,10 +225,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)" endif endif +compressor-$(CONFIG_KERNEL_GZIP) := gz + # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ - cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \ - $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux + cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \ + $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \ + vmlinux image-$(CONFIG_PPC_PSERIES)+= zImage.pseries image-$(CONFIG_PPC_POWERNV)+= zImage.pseries diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 6681ec3625c9..cf7631be5007 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -20,6 +20,8 @@ # -D dir specify directory containing data files used by script # (default ./arch/powerpc/boot) # -W dir specify working directory for temporary files (default .) +# -z use gzip (legacy) +# -Z zsuffixcompression to use (gz, xz or none) # Stop execution if any command fails set -e @@ -38,7 +40,7 @@ dtb= dts= cacheit= binary= -gzip=.gz +compression=.gz pie= format= @@ -59,7 +61,8 @@ tmpdir=. 
usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 -echo ' [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2 +echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 +echo ' [--no-compression] [vmlinux]' >&2 exit 1 } @@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do [ "$#" -gt 0 ] || usage tmpdir="$1" ;; +-z) + compression=.gz + ;; +-Z) + shift + [ "$#" -gt 0 ] || usage +[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage + + compression=".$1" + +if [ $compression = ".none" ]; then +compression= +fi + ;; --no-gzip) -gzip= +# a "feature" of the the wrapper script is that it can be used outside +# the kernel tree. So keeping this around for backwards compatibility. +compression= ;; -?) usage @@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do shift done + if [ -n "$dts" ]; then if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then dts="$object/dts/$dts" @@ -212,7 +232,7 @@ miboot|uboot*) ;; cuboot*) binary=y -gzip= +compression= case "$platform" in *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc) platformo=$object/cuboot-8xx.o @@ -243,7 +263,7 @@ cuboot*) ps3) platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o" lds=$object/zImage.ps3.lds -gzip= +compression= ext=bin objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data" ksection=.kernel:vmlinux.bin @@ -310,27 +330,37 @@ mvme7100) esac vmz="$tmpdir/`basename \"$kernel\"`.$ext" -if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then -${CROSS}objcopy $objflags "$kernel" "$vmz.$$" -strip_size=$(stat -c %s $vmz.$$) +# Calculate the vmlinux.strip size +${CROSS}objcopy $objflags "$kernel" "$vmz.$$" +strip_size=$(stat -c %s $vmz.$$) -if [ -n "$gzip" ]; then +if [ -z "$cacheit" -o ! 
-f "$vmz$compression" -o "$vmz$compression" -ot "$kernel" ]; then +# recompress the image if we need to +case $compression in +.xz) +xz --check=crc32 -f -9 "$vmz.$$" +;; +.gz) gzip -n -f -9 "$vmz.$$" -fi +;; +*) +# drop the compression suffix so the stripped vmlinux is used +compression= + ;; +esac if [
[PATCH v2 4/6] powerpc/boot: remove legacy gzip wrapper
This code is no longer used and can be removed. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/cuboot-c2k.c | 1 - arch/powerpc/boot/gunzip_util.c | 204 arch/powerpc/boot/gunzip_util.h | 45 - 3 files changed, 250 deletions(-) delete mode 100644 arch/powerpc/boot/gunzip_util.c delete mode 100644 arch/powerpc/boot/gunzip_util.h diff --git a/arch/powerpc/boot/cuboot-c2k.c b/arch/powerpc/boot/cuboot-c2k.c index e43594950ba3..9309c51f1d65 100644 --- a/arch/powerpc/boot/cuboot-c2k.c +++ b/arch/powerpc/boot/cuboot-c2k.c @@ -18,7 +18,6 @@ #include "io.h" #include "ops.h" #include "elf.h" -#include "gunzip_util.h" #include "mv64x60.h" #include "cuboot.h" #include "ppcboot.h" diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c deleted file mode 100644 index 9dc52501de83.. --- a/arch/powerpc/boot/gunzip_util.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright 2007 David Gibson, IBM Corporation. - * Based on earlier work, Copyright (C) Paul Mackerras 1997. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include "string.h" -#include "stdio.h" -#include "ops.h" -#include "gunzip_util.h" - -#define HEAD_CRC 2 -#define EXTRA_FIELD4 -#define ORIG_NAME 8 -#define COMMENT0x10 -#define RESERVED 0xe0 - -/** - * gunzip_start - prepare to decompress gzip data - * @state: decompressor state structure to be initialized - * @src: buffer containing gzip compressed or uncompressed data - * @srclen:size in bytes of the buffer at src - * - * If the buffer at @src contains a gzip header, this function - * initializes zlib to decompress the data, storing the decompression - * state in @state. The other functions in this file can then be used - * to decompress data from the gzipped stream. 
- * - * If the buffer at @src does not contain a gzip header, it is assumed - * to contain uncompressed data. The buffer information is recorded - * in @state and the other functions in this file will simply copy - * data from the uncompressed data stream at @src. - * - * Any errors, such as bad compressed data, cause an error to be - * printed an the platform's exit() function to be called. - */ -void gunzip_start(struct gunzip_state *state, void *src, int srclen) -{ - char *hdr = src; - int hdrlen = 0; - - memset(state, 0, sizeof(*state)); - - /* Check for gzip magic number */ - if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) { - /* gzip data, initialize zlib parameters */ - int r, flags; - - state->s.workspace = state->scratch; - if (zlib_inflate_workspacesize() > sizeof(state->scratch)) - fatal("insufficient scratch space for gunzip\n\r"); - - /* skip header */ - hdrlen = 10; - flags = hdr[3]; - if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0) - fatal("bad gzipped data\n\r"); - if ((flags & EXTRA_FIELD) != 0) - hdrlen = 12 + hdr[10] + (hdr[11] << 8); - if ((flags & ORIG_NAME) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & COMMENT) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & HEAD_CRC) != 0) - hdrlen += 2; - if (hdrlen >= srclen) - fatal("gunzip_start: ran out of data in header\n\r"); - - r = zlib_inflateInit2(&state->s, -MAX_WBITS); - if (r != Z_OK) - fatal("inflateInit2 returned %d\n\r", r); - } - - state->s.total_in = hdrlen; - state->s.next_in = src + hdrlen; - state->s.avail_in = srclen - hdrlen; -} - -/** - * gunzip_partial - extract bytes from a gzip data stream - * @state: gzip state structure previously initialized by gunzip_start() - * @dst: buffer to store extracted data - * @dstlen:maximum number of bytes to extract - * - * This function extracts at most @dstlen bytes from the data stream - * previously associated with @state by gunzip_start(), decompressing - * if necessary. 
Exactly @dstlen bytes are extracted unless the data - * stream doesn't contain enough bytes, in which case the entire - * remainder of the stream is decompressed. - * - * Returns the actual number of bytes extracted. If any erro
[PATCH v2 3/6] powerpc/boot: use the preboot decompression API
Currently the powerpc boot wrapper has its own wrapper around zlib to handle decompressing gzipped kernels. The kernel decompressor library functions now provide a generic interface that can be used in the pre-boot environment. This allows boot wrappers to easily support different compression algorithms. This patch converts the wrapper to use this new API, but does not add support for using new algorithms. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 34 +++--- arch/powerpc/boot/decompress.c | 142 + arch/powerpc/boot/main.c | 35 +- arch/powerpc/boot/ops.h| 3 + 4 files changed, 189 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/boot/decompress.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index bede555d78cf..861348c72519 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -63,13 +63,28 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 -# the kernel's version of zlib pulls in a lot of other kernel headers -# which we don't provide inside the wrapper. +# The pre-boot decompressors pull in a lot of kernel headers and other source +# files. This creates a bit of a dependency headache since we need to copy +# these files into the build dir, fix up any includes and ensure that dependent +# files are copied in the right order. + +# these need to be separate variables because they are copied out of different +# directories in the kernel tree. Sure you COULd merge them, but it's a +# cure-is-worse-than-disease situation. 
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ +$(addprefix $(obj)/, decompress.o): \ + $(addprefix $(obj)/,$(zlib-decomp-y)) + +$(addprefix $(obj)/, $(zlib-decomp-y)): \ + $(addprefix $(obj)/,$(zliblinuxheader-y)) \ + $(addprefix $(obj)/,$(zlibheader-y)) \ + $(addprefix $(obj)/,$(zlib-y)) + +$(addprefix $(obj)/,$(zlib-y)): \ $(addprefix $(obj)/,$(zliblinuxheader-y)) \ $(addprefix $(obj)/,$(zlibheader-y)) @@ -79,10 +94,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) -src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ +src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ + elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -143,6 +158,9 @@ $(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call cmd,copy_kern_src) +$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/% + $(call cmd,copy_kern_src) + quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ @@ -160,7 +178,7 @@ $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S $(Q)cp $< $@ clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ - $(libfdt) $(libfdtheader) \ + $(zlib-decomp-) $(libfdt) $(libfdtheader) \ empty.c zImage.coff.lds zImage.ps3.lds 
zImage.lds quiet_cmd_bootcc = BOOTCC $@ @@ -410,8 +428,8 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage.maple simpleImage.* otheros.bld *.dtb # clean up files cached by wrapper -clean-kernel := vmlinux.strip vmlinux.bin -clean-kernel += $(addsuffix .gz,$(clean-kernel)) +clean-kernel-base := vmlinux.strip vmlinux.bin +clean-kernel := $(addsuffix .gz,$(clean-kernel-base)) # If not absolute clean-files are relative to $(obj). clean-files += $(addprefix $(objtree)/, $(clean-kernel)) diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c new file mode 100644 index ..60fc6fb26867 --- /dev/null +++ b/arch/powerpc/boot/decompress.c @@ -0,0 +1,142 @@ +/* + * Wrapper around the kernel's pre-boot decompression library. + * + * Copyright (C) IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software F
[PATCH v2 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP
Most architectures allow the compression algorithm used to produced the vmlinuz image to be selected as a kernel config option. In preperation for supporting algorithms other than gzip in the powerpc boot wrapper the makefile needs to be modified to use these config options. Signed-off-by: Oliver O'Halloran --- arch/powerpc/Kconfig | 1 + arch/powerpc/boot/Makefile | 30 ++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 914983a29156..aa96bda118aa 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -161,6 +161,7 @@ config PPC select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_KERNEL_GZIP config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 7d6768253caa..bede555d78cf 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -19,10 +19,14 @@ all: $(obj)/zImage +compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP + BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ --isystem $(shell $(CROSS32CC) -print-file-name=include) +-isystem $(shell $(CROSS32CC) -print-file-name=include) \ +-D$(compress-y) + ifdef CONFIG_PPC64_BOOT_WRAPPER BOOTCFLAGS += -m64 endif @@ -59,13 +63,15 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 +# the kernel's version of zlib pulls in a lot of other kernel headers +# which we don't provide inside the wrapper. 
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c +zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h +zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -zlib := inffast.c inflate.c inftrees.c -zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h -zliblinuxheader := zlib.h zconf.h zutil.h - -$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ - $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) +$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ + $(addprefix $(obj)/,$(zliblinuxheader-y)) \ + $(addprefix $(obj)/,$(zlibheader-y)) libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h @@ -76,7 +82,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ + gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -128,13 +134,13 @@ obj-plat: $(libfdt) quiet_cmd_copy_kern_src = COPY$@ cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ -$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% +$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ 
@@ -153,7 +159,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S $(Q)cp $< $@ -clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \ +clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ $(libfdt) $(libfdtheader) \ empty.c zImage.coff.lds zImage.ps3.lds zImage.lds -- 2.5.5
[PATCH v2 1/6] powerpc/boot: add sed script
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain rather than the toolchain used for the rest of the kernel. The main problem with this is that the wrapper does not have access to the kernel headers (without a lot of gross hacks). To get around this the required headers are copied into the build directory via several sed scripts which rewrite problematic includes. This patch moves these fixups out of the makefile into a separate .sed script file to clean up makefile slightly. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 16 +--- arch/powerpc/boot/fixup-headers.sed | 12 2 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 arch/powerpc/boot/fixup-headers.sed diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index df0fd406aed1..7d6768253caa 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-wlib obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat obj-plat: $(libfdt) -quiet_cmd_copy_zlib = COPY$@ - cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@ - -quiet_cmd_copy_zlibheader = COPY$@ - cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@ -# stddef.h for NULL -quiet_cmd_copy_zliblinuxheader = COPY$@ - cmd_copy_zliblinuxheader = sed "s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@" $< > $@ +quiet_cmd_copy_kern_src = COPY$@ + cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlib) + $(call cmd,copy_kern_src) $(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlibheader) + $(call cmd,copy_kern_src) $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% - $(call cmd,copy_zliblinuxheader) + $(call cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ diff --git 
a/arch/powerpc/boot/fixup-headers.sed b/arch/powerpc/boot/fixup-headers.sed new file mode 100644 index ..96362428eb37 --- /dev/null +++ b/arch/powerpc/boot/fixup-headers.sed @@ -0,0 +1,12 @@ +# Copyright 2016 IBM Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 or later as +# published by the Free Software Foundation. + +s@#include @@; +s@\"zlib_inflate/\([^\"]*\).*@"\1"@; +s@@@; + +s@__used@@; +s@]*\).*@"\1"@; -- 2.5.5
[v2] XZ compressed zImage support
This series adds support for using XZ compression in addition to gzip in the kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors since it seems that some embedded platforms rely on uBoot (or similar) to decompress the image rather than having the kernel decompress itself. Enabling it for other platforms should be fairly straightforward though. Supporting other compression algorithms (like ARM and x86 do) is possible, but painful. Each algorithm includes some kernel headers even when the #defines that are supposed to make them usable in a pre-boot environment are set. Including kernel headers is an issue because on powerpc the boot wrapper is compiled with a different toolchain and possibly for a different target for backwards compatibility reasons*. This makes it difficult to include kernel headers since the include paths, etc. are not set up for BOOTCC. This can be worked around by rewriting parts of each decompressor with sed scripts, but the rewriting required is specific to each decompressor. -oliver *powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64 bit big endian kernel has a 32bit wrapper to work around this. On 64bit little endian we don't have this legacy problem so the wrapper is also 64bit little endian, but the toolchain issues are still there. --- Changes from v1: fixed some missing dependencies in the Makefile that were causing random build breaks. Fixed "make clean" so that it would remove the files copied into arch/powerpc/boot/ when the wrapper was built. Previously this series renamed "zlibheader" to "zlibheaders". There were consequences. ---
[PATCH 6/6] powerpc/boot: Add support for XZ compression
This patch adds an option to use XZ compression for the kernel image. Currently this is only enabled for PPC64 targets since the bulk of the 32bit platforms produce uboot images which do not use the wrapper. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 2 ++ arch/powerpc/boot/decompress.c | 5 + arch/powerpc/boot/types.h | 10 + arch/powerpc/boot/xz_config.h | 39 ++ arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 57 insertions(+) create mode 100644 arch/powerpc/boot/xz_config.h diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 482bac2af1ff..de36806c1a73 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -20,6 +20,7 @@ all: $(obj)/zImage compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP +compress-$(CONFIG_KERNEL_XZ) := CONFIG_KERNEL_XZ BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ @@ -213,6 +214,7 @@ endif endif compressor-$(CONFIG_KERNEL_GZIP) := gz +compressor-$(CONFIG_KERNEL_XZ) := xz # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c index 60fc6fb26867..8f32ea4289af 100644 --- a/arch/powerpc/boot/decompress.c +++ b/arch/powerpc/boot/decompress.c @@ -37,6 +37,11 @@ # include "decompress_inflate.c" #endif +#ifdef CONFIG_KERNEL_XZ +# include "xz_config.h" +# include "../../../lib/decompress_unxz.c" +#endif + /* globals for tracking the state of the decompression */ static unsigned long decompressed_bytes; static unsigned long limit; diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h index 85565a89bcc2..0362a262a299 100644 --- a/arch/powerpc/boot/types.h +++ b/arch/powerpc/boot/types.h @@ -34,4 +34,14 @@ typedef s64 int64_t; (void) (&_x == &_y);\ _x > _y ? 
_x : _y; }) +#define min_t(type, a, b) min(((type) a), ((type) b)) +#define max_t(type, a, b) max(((type) a), ((type) b)) + +#ifndef true +#define true 1 +#endif + +#ifndef false +#define false 0 +#endif #endif /* _TYPES_H_ */ diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h new file mode 100644 index ..5c6afdbca642 --- /dev/null +++ b/arch/powerpc/boot/xz_config.h @@ -0,0 +1,39 @@ +#ifndef __XZ_CONFIG_H__ +#define __XZ_CONFIG_H__ + +/* + * most of this is copied from lib/xz/xz_private.h, we can't use their defines + * since the boot wrapper is not built in the same environment as the rest of + * the kernel. + */ + +#include "types.h" +#include "swab.h" + +static inline uint32_t swab32p(void *p) +{ + uint32_t *q = p; + + return swab32(*q); +} + +#ifdef __LITTLE_ENDIAN__ +#define get_le32(p) (*((uint32_t *) (p))) +#else +#define get_le32(p) swab32p(p) +#endif + +#define memeq(a, b, size) (memcmp(a, b, size) == 0) +#define memzero(buf, size) memset(buf, 0, size) + +/* prevent the inclusion of the xz-preboot MM headers */ +#define DECOMPR_MM_H +#define memmove memmove +#define XZ_EXTERN static + +/* xz.h needs to be included directly since we need enum xz_mode */ +#include "../../../include/linux/xz.h" + +#undef XZ_EXTERN + +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f32edec13fd1..d5da55b01027 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -2,6 +2,7 @@ config PPC64 bool "64-bit kernel" default n select ZLIB_DEFLATE + select HAVE_KERNEL_XZ help This option selects whether a 32-bit or a 64-bit kernel will be built. -- 2.5.5
[PATCH 5/6] powerpc/boot: add xz support to the wrapper script
This modifies the script so that the -Z option takes an argument to specify the compression type. It can either be 'gz', 'xz' or 'none'. The legazy --no-gzip and -z options are still supported and will set the compression to none and gzip respectively, but they are not documented. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 7 -- arch/powerpc/boot/wrapper | 61 ++ 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 3fdd74ac2fae..482bac2af1ff 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -212,10 +212,13 @@ CROSSWRAP := -C "$(CROSS_COMPILE)" endif endif +compressor-$(CONFIG_KERNEL_GZIP) := gz + # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd quiet_cmd_wrap = WRAP$@ - cmd_wrap =$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \ - $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux + cmd_wrap =$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \ + $(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \ + vmlinux image-$(CONFIG_PPC_PSERIES)+= zImage.pseries image-$(CONFIG_PPC_POWERNV)+= zImage.pseries diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 6681ec3625c9..cf7631be5007 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -20,6 +20,8 @@ # -D dir specify directory containing data files used by script # (default ./arch/powerpc/boot) # -W dir specify working directory for temporary files (default .) +# -z use gzip (legacy) +# -Z zsuffixcompression to use (gz, xz or none) # Stop execution if any command fails set -e @@ -38,7 +40,7 @@ dtb= dts= cacheit= binary= -gzip=.gz +compression=.gz pie= format= @@ -59,7 +61,8 @@ tmpdir=. 
usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 -echo ' [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2 +echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 +echo ' [--no-compression] [vmlinux]' >&2 exit 1 } @@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do [ "$#" -gt 0 ] || usage tmpdir="$1" ;; +-z) + compression=.gz + ;; +-Z) + shift + [ "$#" -gt 0 ] || usage +[ "$1" != "gz" -o "$1" != "xz" -o "$1" != "none" ] || usage + + compression=".$1" + +if [ $compression = ".none" ]; then +compression= +fi + ;; --no-gzip) -gzip= +# a "feature" of the the wrapper script is that it can be used outside +# the kernel tree. So keeping this around for backwards compatibility. +compression= ;; -?) usage @@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do shift done + if [ -n "$dts" ]; then if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then dts="$object/dts/$dts" @@ -212,7 +232,7 @@ miboot|uboot*) ;; cuboot*) binary=y -gzip= +compression= case "$platform" in *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc) platformo=$object/cuboot-8xx.o @@ -243,7 +263,7 @@ cuboot*) ps3) platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o" lds=$object/zImage.ps3.lds -gzip= +compression= ext=bin objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data" ksection=.kernel:vmlinux.bin @@ -310,27 +330,37 @@ mvme7100) esac vmz="$tmpdir/`basename \"$kernel\"`.$ext" -if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then -${CROSS}objcopy $objflags "$kernel" "$vmz.$$" -strip_size=$(stat -c %s $vmz.$$) +# Calculate the vmlinux.strip size +${CROSS}objcopy $objflags "$kernel" "$vmz.$$" +strip_size=$(stat -c %s $vmz.$$) -if [ -n "$gzip" ]; then +if [ -z "$cacheit" -o ! 
-f "$vmz$compression" -o "$vmz$compression" -ot "$kernel" ]; then +# recompress the image if we need to +case $compression in +.xz) +xz --check=crc32 -f -9 "$vmz.$$" +;; +.gz) gzip -n -f -9 "$vmz.$$" -fi +;; +*) +# drop the compression suffix so the stripped vmlinux is used +compression= + ;; +esac if [
[PATCH 4/6] powerpc/boot: remove legacy gzip wrapper
This code is no longer used and can be removed. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/gunzip_util.c | 204 arch/powerpc/boot/gunzip_util.h | 45 - 2 files changed, 249 deletions(-) delete mode 100644 arch/powerpc/boot/gunzip_util.c delete mode 100644 arch/powerpc/boot/gunzip_util.h diff --git a/arch/powerpc/boot/gunzip_util.c b/arch/powerpc/boot/gunzip_util.c deleted file mode 100644 index 9dc52501de83.. --- a/arch/powerpc/boot/gunzip_util.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright 2007 David Gibson, IBM Corporation. - * Based on earlier work, Copyright (C) Paul Mackerras 1997. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include "string.h" -#include "stdio.h" -#include "ops.h" -#include "gunzip_util.h" - -#define HEAD_CRC 2 -#define EXTRA_FIELD4 -#define ORIG_NAME 8 -#define COMMENT0x10 -#define RESERVED 0xe0 - -/** - * gunzip_start - prepare to decompress gzip data - * @state: decompressor state structure to be initialized - * @src: buffer containing gzip compressed or uncompressed data - * @srclen:size in bytes of the buffer at src - * - * If the buffer at @src contains a gzip header, this function - * initializes zlib to decompress the data, storing the decompression - * state in @state. The other functions in this file can then be used - * to decompress data from the gzipped stream. - * - * If the buffer at @src does not contain a gzip header, it is assumed - * to contain uncompressed data. The buffer information is recorded - * in @state and the other functions in this file will simply copy - * data from the uncompressed data stream at @src. - * - * Any errors, such as bad compressed data, cause an error to be - * printed an the platform's exit() function to be called. 
- */ -void gunzip_start(struct gunzip_state *state, void *src, int srclen) -{ - char *hdr = src; - int hdrlen = 0; - - memset(state, 0, sizeof(*state)); - - /* Check for gzip magic number */ - if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) { - /* gzip data, initialize zlib parameters */ - int r, flags; - - state->s.workspace = state->scratch; - if (zlib_inflate_workspacesize() > sizeof(state->scratch)) - fatal("insufficient scratch space for gunzip\n\r"); - - /* skip header */ - hdrlen = 10; - flags = hdr[3]; - if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0) - fatal("bad gzipped data\n\r"); - if ((flags & EXTRA_FIELD) != 0) - hdrlen = 12 + hdr[10] + (hdr[11] << 8); - if ((flags & ORIG_NAME) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & COMMENT) != 0) - while (hdr[hdrlen++] != 0) - ; - if ((flags & HEAD_CRC) != 0) - hdrlen += 2; - if (hdrlen >= srclen) - fatal("gunzip_start: ran out of data in header\n\r"); - - r = zlib_inflateInit2(&state->s, -MAX_WBITS); - if (r != Z_OK) - fatal("inflateInit2 returned %d\n\r", r); - } - - state->s.total_in = hdrlen; - state->s.next_in = src + hdrlen; - state->s.avail_in = srclen - hdrlen; -} - -/** - * gunzip_partial - extract bytes from a gzip data stream - * @state: gzip state structure previously initialized by gunzip_start() - * @dst: buffer to store extracted data - * @dstlen:maximum number of bytes to extract - * - * This function extracts at most @dstlen bytes from the data stream - * previously associated with @state by gunzip_start(), decompressing - * if necessary. Exactly @dstlen bytes are extracted unless the data - * stream doesn't contain enough bytes, in which case the entire - * remainder of the stream is decompressed. - * - * Returns the actual number of bytes extracted. If any errors occur, - * such as a corrupted compressed stream, an error is printed an the - * platform's exit() function is called. 
- */ -int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen) -{ - int len; - - if (state->s.workspace) { - /* gunzipping */ - int r; - - state->s.next_out = dst; - state->s.avail_out = dstlen; - r = zlib_inflate(&state->
[PATCH 3/6] powerpc/boot: use the preboot decompression API
Currently the powerpc boot wrapper has its own wrapper around zlib to handle decompressing gzipped kernels. The kernel decompressor library functions now provide a generic interface that can be used in the pre-boot environment. This allows boot wrappers to easily support different compression algorithms. This patch converts the wrapper to use this new API, but does not add support for using new algorithms. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 10 ++- arch/powerpc/boot/decompress.c | 142 + arch/powerpc/boot/main.c | 35 +- arch/powerpc/boot/ops.h| 3 + 4 files changed, 170 insertions(+), 20 deletions(-) create mode 100644 arch/powerpc/boot/decompress.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 5a99a485d80a..3fdd74ac2fae 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -65,11 +65,12 @@ $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 # the kernel's version of zlib pulls in a lot of other kernel headers # which we don't provide inside the wrapper. 
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c zlibheaders-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ +$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o decompress.o main.o): \ $(addprefix $(obj)/,$(zliblinuxheader-y)) \ $(addprefix $(obj)/,$(zlibheaders-y)) \ $(addprefix $(obj)/,$(zlib-decomp-y)) @@ -80,10 +81,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) -src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ +src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ + decompress.o elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -144,6 +145,9 @@ $(addprefix $(obj)/,$(zlibheaders-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call cmd,copy_kern_src) +$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/% + $(call cmd,copy_kern_src) + quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c new file mode 100644 index ..60fc6fb26867 --- /dev/null +++ b/arch/powerpc/boot/decompress.c @@ -0,0 +1,142 @@ +/* + * Wrapper around the kernel's pre-boot decompression library. + * + * Copyright (C) IBM Corporation 2016. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "elf.h" +#include "page.h" +#include "string.h" +#include "stdio.h" +#include "ops.h" +#include "reg.h" +#include "types.h" + +/* + * The decompressor_*.c files play #ifdef games so they can be used in both + * pre-boot and regular kernel code. We need these definitions to make the + * includes work. + */ + +#define STATIC static +#define INIT +#define __always_inline inline + +/* + * The build process will copy the required zlib source files and headers + * out of lib/ and "fix" the includes so they do not pull in other kernel + * headers. + */ + +#ifdef CONFIG_KERNEL_GZIP +# include "decompress_inflate.c" +#endif + +/* globals for tracking the state of the decompression */ +static unsigned long decompressed_bytes; +static unsigned long limit; +static unsigned long skip; +static char *output_buffer; + +/* + * flush() is called by __decompress() when the decompressor's scratch buffer is + * full. + */ +static long flush(void *v, unsigned long buffer_size) +{ + unsigned long end = decompressed_bytes + buffer_size; + unsigned long size = buffer_size; + unsigned long offset = 0; + char *in = v; + char *out; + + /* +* if we hit our decompression limit, we need to fake an error to abort +* the in-progress decompression. +*/ + if (decompressed_bytes >= limit) + return -1; + + /* skip this entire block */ + if (end <= skip) { +
[PATCH 2/6] powerpc/boot: Use CONFIG_KERNEL_GZIP
Most architectures allow the compression algorithm used to produced the vmlinuz image to be selected as a kernel config option. In preperation for supporting algorithms other than gzip in the powerpc boot wrapper the makefile needs to be modified to use these config options. Signed-off-by: Oliver O'Halloran --- arch/powerpc/Kconfig | 1 + arch/powerpc/boot/Makefile | 31 +++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 927d2ab2ce08..9f0568852ecf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -167,6 +167,7 @@ config PPC select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_KERNEL_GZIP config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index f98e42ee2534..5a99a485d80a 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -19,10 +19,14 @@ all: $(obj)/zImage +compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP + BOOTCFLAGS:= -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ --isystem $(shell $(CROSS32CC) -print-file-name=include) +-isystem $(shell $(CROSS32CC) -print-file-name=include) \ +-D$(compress-y) + ifdef CONFIG_PPC64_BOOT_WRAPPER BOOTCFLAGS += -m64 endif @@ -59,13 +63,16 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405 +# the kernel's version of zlib pulls in a lot of other kernel headers +# which we don't provide inside the wrapper. 
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c +zlibheaders-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h +zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h -zlib := inffast.c inflate.c inftrees.c -zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h -zliblinuxheader := zlib.h zconf.h zutil.h - -$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ - $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) +$(addprefix $(obj)/,$(zlib-y) cuboot-c2k.o gunzip_util.o main.o): \ + $(addprefix $(obj)/,$(zliblinuxheader-y)) \ + $(addprefix $(obj)/,$(zlibheaders-y)) \ + $(addprefix $(obj)/,$(zlib-decomp-y)) libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h @@ -76,7 +83,7 @@ $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ $(libfdt) libfdt-wrapper.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ - gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ + gunzip_util.c elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c @@ -128,13 +135,13 @@ obj-plat: $(libfdt) quiet_cmd_copy_kern_src = COPY$@ cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ -$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zlibheaders)): $(obj)/%: $(srctree)/lib/zlib_inflate/% +$(addprefix $(obj)/,$(zlibheaders-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/% $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% +$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/% $(call 
cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ @@ -153,7 +160,7 @@ $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S @cp $< $@ -clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \ +clean-files := $(zlib-y) $(zlibheaders-y) $(zliblinuxheader-y) \ $(libfdt) $(libfdtheader) \ empty.c zImage.coff.lds zImage.ps3.lds zImage.lds -- 2.5.5
XZ compressed zImage support
This series adds support for using XZ compression in addition to gzip in the kernel boot wrapper. Currently this is only enabled for 64bit Book3S processors since it seems that some embedded platforms rely on uBoot (or similar) to decompress the image rather than having the kernel decompress itself. Enabling it for other platforms should be fairly straightforward though. Supporting other compression algorithms (like ARM and x86 do) is possible, but painful. Each algorithm includes some kernel headers even when the #defines that are supposed to make them usable in a pre-boot environment are set. Including kernel headers is an issue because on powerpc the boot wrapper is compiled with a different toolchain and possibly for a different target for backwards compatibility reasons*. This makes it difficult to include kernel headers since the include paths, etc are not set up for BOOTCC. This can be worked around by rewriting parts of each decompressor with sed scripts, but the rewriting required is specific to each decompressor. -oliver *powermacs have 32bit firmware that cannot directly load a 64bit kernel. A 64 bit big endian kernel has a 32bit wrapper to work around this. On 64bit little endian we don't have this legacy problem so the wrapper is also 64bit little endian, but the toolchain issues are still there.
[PATCH 1/6] powerpc/boot: add sed script
The powerpc boot wrapper is compiled with a separate "bootcc" toolchain rather than the toolchain used for the rest of the kernel. The main problem with this is that the wrapper does not have access to the kernel headers (without a lot of gross hacks). To get around this the required headers are copied into the build directory via several sed scripts which rewrite problematic includes. This patch moves these fixups out of the makefile into a separate .sed script file to clean up makefile slightly. Signed-off-by: Oliver O'Halloran --- arch/powerpc/boot/Makefile | 18 ++ arch/powerpc/boot/fixup-headers.sed | 12 2 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 arch/powerpc/boot/fixup-headers.sed diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 1a2a6e8dc40d..f98e42ee2534 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -125,23 +125,17 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-wlib obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat obj-plat: $(libfdt) -quiet_cmd_copy_zlib = COPY$@ - cmd_copy_zlib = sed "s@__used@@;s@]*\).*@\"\1\"@" $< > $@ - -quiet_cmd_copy_zlibheader = COPY$@ - cmd_copy_zlibheader = sed "s@]*\).*@\"\1\"@" $< > $@ -# stddef.h for NULL -quiet_cmd_copy_zliblinuxheader = COPY$@ - cmd_copy_zliblinuxheader = sed "s@@\"string.h\"@;s@@@;s@]*\).*@\"\1\"@" $< > $@ +quiet_cmd_copy_kern_src = COPY$@ + cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@ $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlib) + $(call cmd,copy_kern_src) -$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/% - $(call cmd,copy_zlibheader) +$(addprefix $(obj)/,$(zlibheaders)): $(obj)/%: $(srctree)/lib/zlib_inflate/% + $(call cmd,copy_kern_src) $(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/% - $(call cmd,copy_zliblinuxheader) + $(call 
cmd,copy_kern_src) quiet_cmd_copy_libfdt = COPY$@ cmd_copy_libfdt = cp $< $@ diff --git a/arch/powerpc/boot/fixup-headers.sed b/arch/powerpc/boot/fixup-headers.sed new file mode 100644 index ..96362428eb37 --- /dev/null +++ b/arch/powerpc/boot/fixup-headers.sed @@ -0,0 +1,12 @@ +# Copyright 2016 IBM Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 or later as +# published by the Free Software Foundation. + +s@#include @@; +s@\"zlib_inflate/\([^\"]*\).*@"\1"@; +s@@@; + +s@__used@@; +s@]*\).*@"\1"@; -- 2.5.5
Re: [PATCH v5 04/13] powerpc: Factor out relocation code from module_64.c to elf_util_64.c.
On Tue, Aug 23, 2016 at 1:21 PM, Balbir Singh wrote: > >> zImage on ppc64 BE is an ELF32 file. This patch set only supports loading >> ELF files of the same class as the kernel, so a 64 bit kernel can't load an >> ELF32 file. It would be possible to add such support, but it would be a new >> feature. >> >> The distros I was able to check on ppc64 LE and BE all use vmlinux. >> kexec-tools with kexec_load also doesn't support zImage. Do you think it is >> important to support zImage? > > Well if it didn't work already, I think it's low priority. Michael should be > able to confirm this. Oliver's been trying to clean up the zImage to get rid of > the old zImage limitation, cc'ing him I don't think it's ever worked so I wouldn't worry too much about supporting it. Fixing kexec-into-zImage and fixing the 32bit wrapper on 64bit BE kernel problem has been on my TODO list for a while, but it's not a priority. oliver
[PATCH v2] powerpc/mm: Add a parameter to disable 1TB segs
This patch adds the kernel command line parameter "disable_1tb_segments" which forces the kernel to use 256MB rather than 1TB segments. Forcing the use of 256MB segments makes it considerably easier to test code that depends on an SLB miss occurring. Suggested-by: Michael Neuling Suggested-by: Michael Ellerman Signed-off-by: Oliver O'Halloran --- Changes from v1: Renamed parameter from "no_tb_segs" to "disable_1tb_segments" Added kernel-parameters.txt entry Documentation/kernel-parameters.txt | 6 ++ arch/powerpc/mm/hash_utils_64.c | 15 +++ 2 files changed, 21 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4640ea2dce9b..3be08fda82dd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -920,6 +920,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. dhash_entries= [KNL] Set number of hash buckets for dentry cache. + disable_1tb_segments [PPC] + Disables the use of 1TB hash page table segments. This + causes the kernel to fall back to 256MB segments which + can be useful when debugging issues that require an SLB + miss to occur. + disable=[IPV6] See Documentation/networking/ipv6.txt. 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 5b22ba0b58bc..7e6d38e01645 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -321,6 +321,15 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } +static bool disable_1tb_segments = false; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -339,6 +348,12 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node, for (; size >= 4; size -= 4, ++prop) { if (be32_to_cpu(prop[0]) == 40) { DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; return 1; } -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/mm: Add a parameter to disable 1TB segs
This patch adds the kernel command line parameter "no_tb_segs" which forces the kernel to use 256MB rather than 1TB segments. Forcing the use of 256MB segments makes it considerably easier to test code that depends on an SLB miss occurring. Suggested-by: Michael Neuling Suggested-by: Michael Ellerman Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/hash_utils_64.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 5b22ba0b58bc..6da1a9d18e15 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -321,6 +321,15 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } +static bool no_tb_segs = false; + +static int __init parse_no_tb_segs(char *p) +{ + no_tb_segs = true; + return 0; +} +early_param("no_tb_segs", parse_no_tb_segs); + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -339,6 +348,12 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node, for (; size >= 4; size -= 4, ++prop) { if (be32_to_cpu(prop[0]) == 40) { DBG("1T segment support detected\n"); + + if (no_tb_segs) { + DBG("Forcing 256MB segments\n"); + break; + } + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; return 1; } -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v6] powerpc/timer - large decrementer support
Power ISAv3 adds a large decrementer (LD) mode which increases the size of the decrementer register. The size of the enlarged decrementer register is between 32 and 64 bits with the exact size being dependent on the implementation. When in LD mode, reads are sign extended to 64 bits and a decrementer exception is raised when the high bit is set (i.e the value goes below zero). Writes however are truncated to the physical register width so some care needs to be taken to ensure that the high bit is not set when reloading the decrementer. This patch adds support for using the LD inside the host kernel on processors that support it. When LD mode is supported firmware will supply the ibm,dec-bits property for CPU nodes to allow the kernel to determine the maximum decrementer value. Enabling LD mode is a hypervisor privileged operation so the kernel can only enable it manually when running in hypervisor mode. Guests that support LD mode can request it using the "ibm,client-architecture-support" firmware call (not implemented in this patch) or some other platform specific method. If this property is not supplied then the traditional decrementer width of 32 bit is assumed and LD mode will not be enabled. This patch was based on initial work by Jack Miller. Signed-off-by: Oliver O'Halloran Signed-off-by: Balbir Singh Acked-by: Michael Neuling Cc: Jack Miller --- Changes from v5: Removed readback test after enabling LD mode since mikey thought it was dumb. 
Replaced use of of_get_property() and of_read_number() with of_property_read_u32() --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 ++-- arch/powerpc/kernel/time.c | 67 - 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a0948f40bc7b..12d970d64bb3 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. 
*/ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 7a482a7f4d8d..d1cb44ddfc95 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -505,8 +506,8 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); + if (now <= decrementer_max) + set_dec(now); /* We may have raced with new irq work */ if (test_irq_work_pending()) set_dec(1); @@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, jus
[PATCH v3] powerpc/boot: Add OPAL console to epapr wrappers
This patch adds an OPAL console backend to the powerpc boot wrapper so that decompression failures inside the wrapper can be reported to the user. This is important since it typically indicates data corruption in the firmware and other nasty things. Currently this only works when building a little endian kernel. When compiling a 64 bit BE kernel the wrapper is always build 32 bit to be compatible with some 32 bit firmwares. BE support will be added at a later date. Another limitation of this is that only the "raw" type of OPAL console is supported, however machines that provide a hvsi console also provide a raw console so this is not an issue in practice. Actually-written-by: Benjamin Herrenschmidt Signed-off-by: Oliver O'Halloran Cc: Stewart Smith Cc: sta...@vger.kernel.org --- Changelog: v2: Added missing files v3: Added copyright headers to opal.c and opal-calls.S --- arch/powerpc/boot/Makefile | 4 +- arch/powerpc/boot/opal-calls.S | 58 + arch/powerpc/boot/opal.c | 97 ++ arch/powerpc/boot/ops.h| 1 + arch/powerpc/boot/ppc_asm.h| 4 ++ arch/powerpc/boot/serial.c | 2 + arch/powerpc/boot/types.h | 10 + 7 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/boot/opal-calls.S create mode 100644 arch/powerpc/boot/opal.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 8fe78a3efc92..00cf88aa9a23 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h -$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \ +$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ @@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ ns16550.c serial.c simple_alloc.c 
div64.S util.S \ gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ - uartlite.c mpc52xx-psc.c + uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S new file mode 100644 index ..ff2f1b97bc53 --- /dev/null +++ b/arch/powerpc/boot/opal-calls.S @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "ppc_asm.h" +#include "../include/asm/opal-api.h" + + .text + +#define OPAL_CALL(name, token) \ + .globl name;\ +name: \ + li r0, token; \ + b opal_call; + +opal_call: + mflrr11 + std r11,16(r1) + mfcrr12 + stw r12,8(r1) + mr r13,r2 + + /* Set opal return address */ + ld r11,opal_return@got(r2) + mtlrr11 + mfmsr r12 + + /* switch to BE when we enter OPAL */ + li r11,MSR_LE + andcr12,r12,r11 + mtspr SPRN_HSRR1,r12 + + /* load the opal call entry point and base */ + ld r11,opal@got(r2) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +opal_return: + FIXUP_ENDIAN + mr r2,r13; + lwz r11,8(r1); + ld r12,16(r1) + mtcrr11; + mtlrr12 + blr + +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_poll_events,OPAL_POLL_EVENTS); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c new file mode 100644 index ..3a2ce1e1f048 --- /dev/null +++ b/arch/powerpc/boot/opal.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016 IBM 
Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or
[PATCH v2] powerpc/boot: Add OPAL console to epapr wrappers
This patch adds an OPAL console backend to the powerpc boot wrapper so that decompression failures inside the wrapper can be reported to the user. This is important since it typically indicates data corruption in the firmware and other nasty things. Currently this only works when building a little endian kernel. When compiling a 64 bit BE kernel the wrapper is always build 32 bit to be compatible with some 32 bit firmwares. BE support will be added at a later date. Another limitation of this is that only the "raw" type of OPAL console is supported, however machines that provide a hvsi console also provide a raw console so this is not an issue in practice. Actually-written-by: Benjamin Herrenschmidt Signed-off-by: Oliver O'Halloran Cc: Stewart Smith Cc: sta...@vger.kernel.org --- arch/powerpc/boot/Makefile | 4 +- arch/powerpc/boot/opal-calls.S | 49 +++ arch/powerpc/boot/opal.c | 88 ++ arch/powerpc/boot/ops.h| 1 + arch/powerpc/boot/ppc_asm.h| 4 ++ arch/powerpc/boot/serial.c | 2 + arch/powerpc/boot/types.h | 12 ++ 7 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/boot/opal-calls.S create mode 100644 arch/powerpc/boot/opal.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 8fe78a3efc92..00cf88aa9a23 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h -$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \ +$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ @@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ oflib.c ofconsole.c 
cuboot.c mpsc.c cpm-serial.c \ - uartlite.c mpc52xx-psc.c + uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S new file mode 100644 index ..1f3c097e1552 --- /dev/null +++ b/arch/powerpc/boot/opal-calls.S @@ -0,0 +1,49 @@ +#include "ppc_asm.h" +#include "../include/asm/opal-api.h" + + .text + +#define OPAL_CALL(name, token) \ + .globl name;\ +name: \ + li r0, token; \ + b opal_call; + +opal_call: + mflrr11 + std r11,16(r1) + mfcrr12 + stw r12,8(r1) + mr r13,r2 + + /* Set opal return address */ + ld r11,opal_return@got(r2) + mtlrr11 + mfmsr r12 + + /* switch to BE when we enter OPAL */ + li r11,MSR_LE + andcr12,r12,r11 + mtspr SPRN_HSRR1,r12 + + /* load the opal call entry point and base */ + ld r11,opal@got(r2) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +opal_return: + FIXUP_ENDIAN + mr r2,r13; + lwz r11,8(r1); + ld r12,16(r1) + mtcrr11; + mtlrr12 + blr + +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_poll_events,OPAL_POLL_EVENTS); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c new file mode 100644 index ..d0f54443caa9 --- /dev/null +++ b/arch/powerpc/boot/opal.c @@ -0,0 +1,88 @@ +#include "ops.h" +#include "stdio.h" +#include "io.h" +#include +#include "../include/asm/opal-api.h" + +/* Global OPAL struct used by opal-call.S */ +struct opal { + u64 base; + u64 entry; +} opal; + +static u32 opal_con_id; + +int64_t opal_console_write(int64_t term_number, u64 *length, const u8 *buffer); +int64_t opal_console_read(int64_t term_number, uint64_t *length, u8 *buffer); +int64_t opal_console_write_buffer_space(uint64_t 
term_number, uint64_t *length); +int64_t opal_console_flush(uint64_t term_number); +int64_t opal_poll_events(uint64_t *outstanding_event_mask); + +static int opal_con_open
[PATCH] powerpc/boot: Add OPAL console to epapr wrappers
This patch adds an OPAL console backend to the powerpc boot wrapper so that decompression failures inside the wrapper can be reported to the user. This is important since it typically indicates data corruption in the firmware and other nasty things. Currently this only works when building a little endian kernel. When compiling a 64 bit BE kernel the wrapper is always build 32 bit to be compatible with some 32 bit firmwares. BE support will be added at a later date. Another limitation of this is that only the "raw" type of OPAL console is supported, however machines that provide a hvsi console also provide a raw console so this is not an issue in practice. Actually-written-by: Benjamin Herrenschmidt Signed-off-by: Oliver O'Halloran Cc: Stewart Smith Cc: sta...@vger.kernel.org --- arch/powerpc/boot/Makefile | 4 ++-- arch/powerpc/boot/ops.h | 1 + arch/powerpc/boot/ppc_asm.h | 4 arch/powerpc/boot/serial.c | 2 ++ arch/powerpc/boot/types.h | 12 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 8fe78a3efc92..00cf88aa9a23 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -70,7 +70,7 @@ $(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \ libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h -$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o): \ +$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ $(addprefix $(obj)/,$(libfdtheader)) src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ @@ -78,7 +78,7 @@ src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ - uartlite.c mpc52xx-psc.c + uartlite.c mpc52xx-psc.c opal.c opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c 
src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index 5e75e1c5518e..e19b64ef977a 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -89,6 +89,7 @@ int mpsc_console_init(void *devp, struct serial_console_data *scdp); int cpm_console_init(void *devp, struct serial_console_data *scdp); int mpc5200_psc_console_init(void *devp, struct serial_console_data *scdp); int uartlite_console_init(void *devp, struct serial_console_data *scdp); +int opal_console_init(void *devp, struct serial_console_data *scdp); void *simple_alloc_init(char *base, unsigned long heap_size, unsigned long granularity, unsigned long max_allocs); extern void flush_cache(void *, unsigned long); diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h index 35ea60c1f070..b03373d8b386 100644 --- a/arch/powerpc/boot/ppc_asm.h +++ b/arch/powerpc/boot/ppc_asm.h @@ -61,6 +61,10 @@ #define SPRN_TBRL 268 #define SPRN_TBRU 269 +#define SPRN_HSRR0 0x13A /* Hypervisor Save/Restore 0 */ +#define SPRN_HSRR1 0x13B /* Hypervisor Save/Restore 1 */ + +#define MSR_LE 0x0001 #define FIXUP_ENDIAN \ tdi 0, 0, 0x48; /* Reverse endian of b . 
+ 8 */ \ diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c index 167ee9433de6..e04c1e4063ae 100644 --- a/arch/powerpc/boot/serial.c +++ b/arch/powerpc/boot/serial.c @@ -132,6 +132,8 @@ int serial_console_init(void) else if (dt_is_compatible(devp, "xlnx,opb-uartlite-1.00.b") || dt_is_compatible(devp, "xlnx,xps-uartlite-1.00.a")) rc = uartlite_console_init(devp, &serial_cd); + else if (dt_is_compatible(devp, "ibm,opal-console-raw")) + rc = opal_console_init(devp, &serial_cd); /* Add other serial console driver calls here */ diff --git a/arch/powerpc/boot/types.h b/arch/powerpc/boot/types.h index 31393d17a9c1..cda474cd63c8 100644 --- a/arch/powerpc/boot/types.h +++ b/arch/powerpc/boot/types.h @@ -12,6 +12,18 @@ typedef shorts16; typedef ints32; typedef long long s64; + +/* required for opal-api.h */ +typedef u8 uint8_t; +typedef u16 uint16_t; +typedef u32 uint32_t; +typedef u64 uint64_t; +typedef s8 int8_t; +typedef s16 int16_t; +typedef s32 int32_t; +typedef s64 int64_t; + + #define min(x,y) ({ \ typeof(x) _x = (x); \ typeof(y) _y = (y); \ -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5] powerpc/timer - large decrementer support
Power ISAv3 adds a large decrementer (LD) mode which increases the size of the decrementer register. The size of the enlarged decrementer register is between 32 and 64 bits with the exact size being dependent on the implementation. When in LD mode, reads are sign extended to 64 bits and a decrementer exception is raised when the high bit is set (i.e the value goes below zero). Writes however are truncated to the physical register width so some care needs to be taken to ensure that the high bit is not set when reloading the decrementer. This patch adds support for using the LD inside the host kernel on processors that support it. When LD mode is supported firmware will supply the ibm,dec-bits property for CPU nodes to allow the kernel to determine the maximum decrementer value. Enabling LD mode is a hypervisor privileged operation so the kernel can only enable it manually when running in hypervisor mode. Guest kernels that support LD mode can request it using the "ibm,client-architecture-support" firmware call or some other platform specific method. If this property is not supplied then the traditional decrementer width of 32 bit is assumed and LD mode will not be enabled. This patch was based on initial work by Jack Miller. 
Signed-off-by: Oliver O'Halloran Signed-off-by: Balbir Singh Cc: Michael Neuling Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 104 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a0948f40bc7b..12d970d64bb3 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. 
*/ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 7a482a7f4d8d..efebe52133ef 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -505,8 +506,8 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); + if (now <= decrementer_max) + set_dec(now); /* We may have raced with new irq work */ if (test_irq_work_pending()) set_dec(1); @@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disab
[PATCHv4] powerpc/timer - large decrementer support
Power ISAv3 adds a large decrementer (LD) mode which increases the size of the decrementer register. The size of the enlarged decrementer register is between 32 and 64 bits with the exact size being dependent on the implementation. When in LD mode, reads are sign extended to 64 bits and a decrementer exception is raised when the high bit is set (i.e the value goes below zero). Writes however are truncated to the physical register width so some care needs to be taken to ensure that the high bit is not set when reloading the decrementer. This patch adds support for using the LD inside the host kernel on processors that support it. When LD mode is supported firmware will supply the ibm,dec-bits property for CPU nodes to allow the kernel to determine the maximum decrementer value. Enabling LD mode is a hypervisor privileged operation so the kernel can only enable it manually when running in hypervisor mode. Guest kernels that support LD mode can request it using the "ibm,client-architecture-support" firmware call or some other platform specific method. If this property is not supplied then the traditional decrementer width of 32 bit is assumed and LD mode will not be enabled. This patch was based on initial work by Jack Miller. 
Signed-off-by: Oliver O'Halloran Signed-off-by: Balbir Singh Cc: Michael Neuling Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 102 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a0948f40bc7b..12d970d64bb3 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. 
*/ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 7a482a7f4d8d..aa6d399d939b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -505,8 +506,8 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); + if (now <= decrementer_max) + set_dec(now); /* We may have raced with new irq work */ if (test_irq_work_pending()) set_dec(1); @@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disab
[PATCH 2/2] KVM: PPC: hypervisor large decrementer support
Power ISAv3 extends the width of the decrementer register beyond 32 bits.
Signed-off-by: Oliver O'Halloran Cc: Paul Mackerras Cc: Michael Neuling --- arch/powerpc/include/asm/exception-64s.h | 29 arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 +- arch/powerpc/kvm/book3s_hv_interrupts.S | 3 +-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++-- arch/powerpc/kvm/emulate.c | 6 ++--- 7 files changed, 57 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 93ae809fe5ea..4fa303bf6d5b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif +/* + * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by + * setting the LD flag the LPCR. The decrementer value is a signed quantity so + * sign exension is required when operating in 32 bit mode. The GET_DEC() and + * GET_HDEC() handle this sign extension and yield a 64 bit result independent + * of the LD mode. + * + * NB: It's possible run with LD mode disabled on ISAv3 so GET_DEC() does not + * use a CPU_FEATURE section. A feature section is used for GET_HDEC because + * it has no mode bit. It is always 64 bits for ISAv3 processors. + */ + +#define IS_LD_ENABLED(reg) \ + mfspr reg,SPRN_LPCR; \ + andis. 
reg,reg,(LPCR_LD >> 16); + +#define GET_DEC(reg) \ + IS_LD_ENABLED(reg);\ + mfspr reg, SPRN_DEC; \ + bne 99f; \ + extsw reg, reg;\ +99: + +#define GET_HDEC(reg) \ + mfspr reg, SPRN_HDEC; \ +BEGIN_FTR_SECTION \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ec35af34a3fb..ddea233e2cce 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -520,7 +520,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + u64 dec; #ifdef CONFIG_BOOKE u32 decar; #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544edabe7f3..4de0102930e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); -extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..2dd92e841127 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -215,7 +215,7 @@ struct kvm_sregs { __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ __u32 tcr; __u32 decar; - __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + __u64 dec; /* KVM_SREGS_E_UPDATE_DEC */ /* * Users
[PATCH 1/2] powerpc/timer - large decrementer support
POWER ISA v3 adds large decrementer (LD) mode of operation which increases the size of the decrementer register from 32 bits to an implementation defined with of up to 64 bits. This patch adds support for the LD on processors with the CPU_FTR_ARCH_300 cpu feature flag set. For CPUs with this feature LD mode is enabled when when the ibm,dec-bits devicetree property is supplied for the boot CPU. The decrementer value is a signed quantity (with negative values indicating a pending exception) and this property is required to find the maximum positive decrementer value. If this property is not supplied then the traditional decrementer width of 32 bits is assumed and LD mode is disabled. This patch was based on initial work by Jack Miller. Signed-off-by: Oliver O'Halloran Cc: Michael Neuling Cc: Balbir Singh Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 94 + 3 files changed, 90 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c1e82e968506..2793f3f03f9b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. 
If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. */ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 3ed9a5a21d77..fe66f1c8d8b2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -96,7 +96,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -504,8 +505,8 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); + if (now <= decrementer_max) + set_dec(now); /* We may have raced with new irq work */ if (test_irq_work_pending()) set_dec(1); @@ -535,7 +536,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -583,9 +584,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. 
*/ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disable(); - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); } static void generic_suspend_enable_irqs(void) @@ -866,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt, static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(DECREMENTER_MAX, dev); + decrementer_set_next_event(decrementer_max, dev); return 0; } @@ -892,6 +893,76 @@ static void register_decrementer_clockevent(int cpu) clockevents_register_device(dec);
[PATCH v2] powerpc/mm: Ensure "special" zones are empty
The mm zone mechanism was traditionally used by arch specific code to partition memory into allocation zones. However there are several zones that are managed by the mm subsystem rather than the architecture. Most architectures set the max PFN of these special zones to zero, however on powerpc we set them to ~0ul. This, in conjunction with a bug in free_area_init_nodes() results in all of system memory being placed in ZONE_DEVICE when enabled. Device memory cannot be used for regular kernel memory allocations so this will cause a kernel panic at boot. Given the planned addition of more mm managed zones (ZONE_CMA) we should aim to be consistent with every other architecture and set the max PFN for these zones to zero. Signed-off-by: Oliver O'Halloran Reviewed-by: Balbir Singh Cc: linux...@kvack.org --- arch/powerpc/mm/mem.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 879e0bc6f82e..f35e6605c422 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void) static bool zone_limits_final; +/* + * The memory zones past TOP_ZONE are managed by generic mm code. + * These should be set to zero since that's what every other + * architecture does. + */ static unsigned long max_zone_pfns[MAX_NR_ZONES] = { - [0 ... MAX_NR_ZONES - 1] = ~0UL + [0... TOP_ZONE] = ~0UL, + [TOP_ZONE + 1 ... MAX_NR_ZONES - 1] = 0 }; /* -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 2/2] KVM: PPC: hypervisor large decrementer support
Power ISAv3 extends the width of the decrementer register beyond 32 bits.
Signed-off-by: Oliver O'Halloran Cc: Paul Mackerras Cc: Michael Neuling --- arch/powerpc/include/asm/exception-64s.h | 29 arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/kvm/book3s_hv_interrupts.S | 3 +-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++-- arch/powerpc/kvm/emulate.c | 6 ++--- 8 files changed, 58 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 93ae809fe5ea..4fa303bf6d5b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif +/* + * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by + * setting the LD flag the LPCR. The decrementer value is a signed quantity so + * sign exension is required when operating in 32 bit mode. The GET_DEC() and + * GET_HDEC() handle this sign extension and yield a 64 bit result independent + * of the LD mode. + * + * NB: It's possible run with LD mode disabled on ISAv3 so GET_DEC() does not + * use a CPU_FEATURE section. A feature section is used for GET_HDEC because + * it has no mode bit. It is always 64 bits for ISAv3 processors. + */ + +#define IS_LD_ENABLED(reg) \ + mfspr reg,SPRN_LPCR; \ + andis. 
reg,reg,(LPCR_LD >> 16); + +#define GET_DEC(reg) \ + IS_LD_ENABLED(reg);\ + mfspr reg, SPRN_DEC; \ + bne 99f; \ + extsw reg, reg;\ +99: + +#define GET_HDEC(reg) \ + mfspr reg, SPRN_HDEC; \ +BEGIN_FTR_SECTION \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..6330d3fca083 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -516,7 +516,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + u64 dec; #ifdef CONFIG_BOOKE u32 decar; #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544edabe7f3..4de0102930e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); -extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..2dd92e841127 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -215,7 +215,7 @@ struct kvm_sregs { __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ __u32 tcr; __u32 decar; - __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + __u64 dec; /* KVM_SR
[PATCH v3 1/2] powerpc/timer - large decrementer support
POWER ISA v3 adds large decrementer (LD) mode of operation which increases the size of the decrementer register from 32 bits to an implementation defined with of up to 64 bits. This patch adds support for the LD on processors with the CPU_FTR_ARCH_300 cpu feature flag set. For CPUs with this feature LD mode is enabled when when the ibm,dec-bits devicetree property is supplied for the boot CPU. The decrementer value is a signed quantity (with negative values indicating a pending exception) and this property is required to find the maximum positive decrementer value. If this property is not supplied then the traditional decrementer width of 32 bits is assumed and LD mode is disabled. This patch was based on initial work by Jack Miller. Signed-off-by: Oliver O'Halloran Cc: Michael Neuling Cc: Balbir Singh Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 92 + 3 files changed, 89 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66bbbc9..ff581ed1ab9d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. 
If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. */ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a39ee..0656e80cadbf 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -503,7 +504,7 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) + if (now <= decrementer_max) set_dec((int)now); /* We may have raced with new irq work */ if (test_irq_work_pending()) @@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -582,9 +583,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. 
*/ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disable(); - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); } static void generic_suspend_enable_irqs(void) @@ -865,7 +866,7 @@ static int decrementer_set_next_event(unsigned long evt, static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(DECREMENTER_MAX, dev); + decrementer_set_next_event(decrementer_max, dev); return 0; } @@ -891,6 +892,76 @@ static void register_decrementer_clockevent(int cpu) clockevents_register_device(dec); } +static inline bool cpu_has_large_dec(void) +{ + return cpu_
[RFC PATCH] mm/init: fix zone boundary creation
As a part of memory initialisation the architecture passes an array to free_area_init_nodes() which specifies the max PFN of each memory zone. This array is not necessarily monotonic (due to unused zones) so this array is parsed to build monotonic lists of the min and max PFN for each zone. ZONE_MOVABLE is special cased here as its limits are managed by the mm subsystem rather than the architecture. Unfortunately, this special casing is broken when ZONE_MOVABLE is the not the last zone in the zone list. The core of the issue is: if (i == ZONE_MOVABLE) continue; arch_zone_lowest_possible_pfn[i] = arch_zone_highest_possible_pfn[i-1]; As ZONE_MOVABLE is skipped the lowest_possible_pfn of the next zone will be set to zero. This patch fixes this bug by adding explicitly tracking where the next zone should start rather than relying on the contents arch_zone_highest_possible_pfn[]. Signed-off-by: Oliver O'Halloran Cc: linuxppc-dev@lists.ozlabs.org --- mm/page_alloc.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59de90d5d3a3..fc78306ce087 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5980,15 +5980,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = 
end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] powerpc/mm: Ensure "special" zones are empty
The mm zone mechanism was traditionally used by arch specific code to partition memory into allocation zones. However there are several zones that are managed by the mm subsystem rather than the architecture. Most architectures set the max PFN of these special zones to zero, however on powerpc we set them to ~0ul. This, in conjunction with a bug in free_area_init_nodes() results in all of system memory being placed in ZONE_DEVICE when enabled. Device memory cannot be used for regular kernel memory allocations so this will cause a kernel panic at boot. Given the planned addition of more mm managed zones (ZONE_CMA) we should aim to be consistent with every other architecture and set the max PFN for these zones to zero. Signed-off-by: Oliver O'Halloran Cc: linux...@kvack.org --- arch/powerpc/mm/mem.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8f4c19789a38..f0a058ebb6d7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void) static bool zone_limits_final; +/* + * The memory zones past TOP_ZONE are managed by generic mm code. + * These should be set to zero since that's what every other + * architecture does. + */ static unsigned long max_zone_pfns[MAX_NR_ZONES] = { - [0 ... MAX_NR_ZONES - 1] = ~0UL + [0... TOP_ZONE - 1] = ~0UL, + [TOP_ZONE ... MAX_NR_ZONES - 1] = 0 }; /* -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] powerpc/mm: define TOP_ZONE as a constant
The zone that contains the top of memory will be either ZONE_NORMAL or ZONE_HIGHMEM depending on the kernel config. There are two functions that require this information and both of them use an #ifdef to set a local variable (top_zone). This is a little silly so lets just make it a constant. Signed-off-by: Oliver O'Halloran Cc: linux...@kvack.org --- arch/powerpc/mm/mem.c | 17 + 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbde1015..8f4c19789a38 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; 
free_area_init_nodes(max_zone_pfns); -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc/mm: Ensure "special" zones are empty
The mm zone mechanism was traditionally used by arch specific code to partition memory into allocation zones. However there are several zones that are managed by the mm subsystem rather than the architecture. Most architectures set the max PFN of these special zones to zero, however on powerpc we set them to ~0ul. This, in conjunction with a bug in free_area_init_nodes() results in all of system memory being placed being placed in ZONE_DEVICE when enabled. Device memory cannot be used for regular kernel memory allocations so this will cause a kernel panic at boot. Given the planned addition of more mm managed zones (ZONE_CMA) we should aim to be consistent with every other architecture and set the max PFN for these zones to zero Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/mem.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8f4c19789a38..f0a058ebb6d7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -239,8 +239,14 @@ static int __init mark_nonram_nosave(void) static bool zone_limits_final; +/* + * The memory zones past TOP_ZONE are managed by the generic + * mm subsystem which expects the max PFN for these zones + * to be set to zero. + */ static unsigned long max_zone_pfns[MAX_NR_ZONES] = { - [0 ... MAX_NR_ZONES - 1] = ~0UL + [0... TOP_ZONE - 1] = ~0UL, + [TOP_ZONE ... MAX_NR_ZONES - 1] = 0 }; /* -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc/mm: define TOP_ZONE as a constant
The zone that contains the top of memory will be either ZONE_NORMAL or ZONE_HIGHMEM depending on the kernel config. There are two functions in that require this information and both of them use an #ifdef to set a local variable (top_zone). This is a little silly so lets just make it a constant. Signed-off-by: Oliver O'Halloran --- arch/powerpc/mm/mem.c | 17 + 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbde1015..8f4c19789a38 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; free_area_init_nodes(max_zone_pfns); -- 2.5.5 
___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] KVM: PPC: hypervisor large decrementer support
Power ISAv3 extends the width of the decrementer register beyond 32 bits. The enlarged register width is implementation dependent, but reads from these registers are automatically sign extended to produce a 64 bit output when operating in large mode. The HDEC always operates in large mode while the DEC register can be operated in 32-bit mode or large mode depending on the setting of the LPCR.LD bit. Currently the hypervisor assumes that reads from the DEC and HDEC register produce a 32 bit result which it sign extends to 64 bits using the extsw instruction. This behaviour can result in the guest DEC register value being corrupted by the hypervisor when the guest is operating in LD mode since the result of the extsw instruction depends only on the value of bit 31 in the register to be sign extended. This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading from the decrementer registers. These macros will return the current decrementer value as a 64 bit quantity regardless of the Host CPU or guest decrementer operating mode. Additionally this patch corrects several uses of decrementer values that assume a 32 bit register width. 
Signed-off-by: Oliver O'Halloran Cc: Paul Mackerras Cc: Balbir Singh --- arch/powerpc/include/asm/exception-64s.h | 29 arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/kvm/book3s_hv_interrupts.S | 3 +-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++-- arch/powerpc/kvm/emulate.c | 6 ++--- 8 files changed, 58 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 93ae809fe5ea..4fa303bf6d5b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -545,4 +545,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif +/* + * On ISAv3 processors the DEC register can be extended from 32 bits to 64 by + * setting the LD flag the LPCR. The decrementer value is a signed quantity so + * sign exension is required when operating in 32 bit mode. The GET_DEC() and + * GET_HDEC() handle this sign extension and yield a 64 bit result independent + * of the LD mode. + * + * NB: It's possible run with LD mode disabled on ISAv3 so GET_DEC() does not + * use a CPU_FEATURE section. A feature section is used for GET_HDEC because + * it has no mode bit. It is always 64 bits for ISAv3 processors. + */ + +#define IS_LD_ENABLED(reg) \ + mfspr reg,SPRN_LPCR; \ + andis. 
reg,reg,(LPCR_LD >> 16); + +#define GET_DEC(reg) \ + IS_LD_ENABLED(reg);\ + mfspr reg, SPRN_DEC; \ + bne 99f; \ + extsw reg, reg;\ +99: + +#define GET_HDEC(reg) \ + mfspr reg, SPRN_HDEC; \ +BEGIN_FTR_SECTION \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..6330d3fca083 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -516,7 +516,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + u64 dec; #ifdef CONFIG_BOOKE u32 decar; #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544edabe7f3..4de0102930e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); -extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..2dd92e841127 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -215,7 +215,7 @@ struct kvm_sregs { __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ __u32 tcr; __u32 decar; - __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + __u64 dec; /* KVM_SR
[PATCH v2 1/2] powerpc/timer - large decrementer support
POWER ISA v3 adds large decrementer (LD) mode of operation which increases the size of the decrementer register from 32 bits to an implementation defined with of up to 64 bits. This patch adds support for the LD on processors with the CPU_FTR_ARCH_300 cpu feature flag set. Even for CPUs with this feature LD mode is only enabled when the property ibm,dec-bits devicetree property is supplied for the boot CPU. The decrementer value is a signed quantity (with negative values indicating a pending exception) and this property is required to find the maximum positive decrementer value. If this property is not supplied then the traditional decrementer width of 32 bits is assumed and LD mode is disabled. This patch was based on initial work by Jack Miller. Signed-off-by: Oliver O'Halloran Cc: Jack Miller Cc: Balbir Singh --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 89 + 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66bbbc9..ff581ed1ab9d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. 
If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. */ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a39ee..fab34abfb4cd 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -503,7 +504,7 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) + if (now <= decrementer_max) set_dec((int)now); /* We may have raced with new irq work */ if (test_irq_work_pending()) @@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -582,9 +583,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. 
*/ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disable(); - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); } static void generic_suspend_enable_irqs(void) @@ -865,7 +866,7 @@ static int decrementer_set_next_event(unsigned long evt, static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(DECREMENTER_MAX, dev); + decrementer_set_next_event(decrementer_max, dev); return 0; } @@ -891,6 +892,73 @@ static void register_decrementer_clockevent(int cpu) clockevents_register_device(dec); } +static inline bool large_dec_supp(void) +{ + return cpu_has_feature(CPU_F
[PATCH 2/2] KVM: PPC: hypervisor large decrementer support
Power ISAv3 extends the width of the decrementer register beyond 32 bits. The enlarged register width is implementation dependent, but reads from these registers are automatically sign extended to produce a 64 bit output when operating in large mode. The HDEC always operates in large mode while the DEC register can be operated in 32-bit mode or large mode depending on the setting of the LPCR.LD bit. Currently the hypervisor assumes that reads from the DEC and HDEC register produce a 32 bit result which it sign extends to 64 bits using the extsw instruction. This behaviour can result in the guest DEC register value being corrupted by the hypervisor when the guest is operating in LD mode since the result of the extsw instruction depends only on the value of bit 31 in the register to be sign extended. This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading from the decrementer registers. These macros will return the current decrementer value as a 64 bit quantity regardless of the Host CPU or guest decrementer operating mode. Additionally this patch corrects several uses of decrementer values that assume a 32 bit register width. 
Signed-off-by: Oliver O'Halloran Cc: Paul Mackerras --- arch/powerpc/include/asm/exception-64s.h | 22 ++ arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 9 +++- arch/powerpc/kvm/book3s_hv_interrupts.S | 3 +-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++-- arch/powerpc/kvm/emulate.c | 4 ++-- 8 files changed, 57 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 93ae809fe5ea..d922f76c682d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -545,4 +545,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif +/* these ensure that we always get a 64bit value from the + * decrementer register. */ + +#define IS_LD_ENABLED(reg) \ + mfspr reg,SPRN_LPCR; \ + andis. reg,reg,(LPCR_LD >> 16); + +#define GET_DEC(reg) \ + IS_LD_ENABLED(reg);\ + mfspr reg, SPRN_DEC; \ + bne 99f; \ + extsw reg, reg;\ +99: + +/* For CPUs that support it the Hypervisor LD is + * always enabled, so this needs to be feature gated */ +#define GET_HDEC(reg) \ + mfspr reg, SPRN_HDEC; \ +BEGIN_FTR_SECTION \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..6330d3fca083 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -516,7 +516,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + u64 dec; #ifdef CONFIG_BOOKE u32 decar; #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544edabe7f3..4de0102930e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern 
int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); -extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..2dd92e841127 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -215,7 +215,7 @@ struct kvm_sregs { __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ __u32 tcr; __u32 decar; - __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + __u64 dec; /* KVM_SREGS_E_UPDATE_DEC */ /* * Userspace can read TB directly, but the diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716cebf4b8e..984ae894e758 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -641,7 +641,14 @@ masked_##_H##interrupt: \ stb r11,PACAIRQH
[PATCH 1/2] powerpc/timer - large decrementer support
POWER ISA v3 adds large decrementer (LD) mode of operation which increases the size of the decrementer register from 32 bits to an implementation defined with of up to 64 bits. This patch adds support for the LD on processors with the CPU_FTR_ARCH_300 cpu feature flag set. Even for CPUs with this feature LD mode is only enabled when the property ibm,dec-bits devicetree property is supplied for the boot CPU. The decrementer value is a signed quantity (with negative values indicating a pending exception) and this property is required to find the maximum positive decrementer value. If this property is not supplied then the traditional decrementer width of 32 bits is assumed and LD mode is disabled. This patch was based on inital work by Jack Miller. Signed-off-by: Oliver O'Halloran Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 89 + 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66bbbc9..ff581ed1ab9d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. 
If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. */ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a39ee..0afaef6b5b6a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -503,7 +504,7 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) + if (now <= decrementer_max) set_dec((int)now); /* We may have raced with new irq work */ if (test_irq_work_pending()) @@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -562,6 +563,7 @@ void timer_interrupt(struct pt_regs * regs) irq_enter(); __timer_interrupt(); + irq_exit(); set_irq_regs(old_regs); } @@ -582,9 +584,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. 
*/ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disable(); - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); } static void generic_suspend_enable_irqs(void) @@ -865,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt, static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(DECREMENTER_MAX, dev); + decrementer_set_next_event(decrementer_max, dev); return 0; } @@ -891,6 +893,72 @@
[PATCH] powerpc/process: fix altivec SPR not being saved
save_sprs() in process.c contains the following test: if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) t->vrsave = mfspr(SPRN_VRSAVE); CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test is equivalent to: if (cpu_has_feature(CPU_FTR_ALTIVEC) && cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) On CPUs without support for both (i.e. G5) this results in vrsave not being saved between context switches. The vector register save/restore code doesn't use VRSAVE to determine which registers to save/restore, but the value of VRSAVE is used to determine if altivec is being used in several code paths. Signed-off-by: Oliver O'Halloran Signed-off-by: Anton Blanchard Fixes: 152d523e6307 ("powerpc: Create context switch helpers save_sprs() and restore_sprs()") Cc: sta...@vger.kernel.org --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dccc87e8fee5..bc6aa87a3b12 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -854,7 +854,7 @@ void restore_tm_state(struct pt_regs *regs) static inline void save_sprs(struct thread_struct *t) { #ifdef CONFIG_ALTIVEC - if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) + if (cpu_has_feature(CPU_FTR_ALTIVEC)) t->vrsave = mfspr(SPRN_VRSAVE); #endif #ifdef CONFIG_PPC_BOOK3S_64 -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/process: fix altivec SPR not being saved
save_sprs() in process.c contains the following test: if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) t->vrsave = mfspr(SPRN_VRSAVE); CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test is equivalent to: if (cpu_has_feature(CPU_FTR_ALTIVEC) && cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) On CPUs without support for both (i.e. G5) this results in vrsave not being saved between context switches. The vector register save/restore code doesn't use VRSAVE to determine which registers to save/restore, but the value of VRSAVE is used to determine if altivec is being used in several code paths. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8224852..5a4d4d1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -855,7 +855,7 @@ void restore_tm_state(struct pt_regs *regs) static inline void save_sprs(struct thread_struct *t) { #ifdef CONFIG_ALTIVEC - if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) + if (cpu_has_feature(CPU_FTR_ALTIVEC)) t->vrsave = mfspr(SPRN_VRSAVE); #endif #ifdef CONFIG_PPC_BOOK3S_64 -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/sstep.c - Fix emulation fall-through
There is a switch fallthrough in analyse_instr() which can cause an invalid instruction to be emulated as a different, valid, instruction. The rld* (opcode 30) case extracts a sub-opcode from bits 3:1 of the instruction word. However, the only valid values of this field are 001 and 000. These cases are correctly handled, but the others are not which causes execution to fall through into case 31. Breaking out of the switch causes the instruction to be marked as unknown and allows the caller to deal with the invalid instruction in a manner consistent with other invalid instructions. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/lib/sstep.c - Fix emulation fall-through
There is a switch fallthrough in analyse_instr() which can cause an invalid instruction to be emulated as a different, valid, instruction. The rld* (opcode 30) case extracts a sub-opcode from bits 3:1 of the instruction word. However, the only valid values of this field are 001 and 000. These cases are correctly handled, but the others are not which causes execution to fall through into case 31. Breaking out of the switch causes the instruction to be marked as unknown and allows the caller to deal with the invalid instruction in a manner consistent with other invalid instructions. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] Fix fall-through from case 30 (rld*) to case 31
I think this bug can only be triggered if the instruction to simulate is malformed. The switch in the else case only handles the zero and one case, but it extracts bits 4:1 from the instruction word so it may be other values. It's pretty minor, but a bug is a bug. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev