From 39338bc28d81a9abbb51eef7b564125d4ec9cea6 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Mon, 5 May 2014 17:58:32 +0300 Subject: [PATCH] Changes representative of linux-3.10.0-123.el7.tar.xz --- Makefile | 2 +- drivers/misc/mei/hw-me-regs.h | 5 + drivers/misc/mei/pci-me.c | 30 +++-- fs/autofs4/root.c | 4 +- mm/huge_memory.c | 53 +++++++-- mm/mlock.c | 2 + mm/mprotect.c | 40 +++++-- mm/rmap.c | 14 ++- security/device_cgroup.c | 200 ++++++++++++++++++++++++++-------- 9 files changed, 276 insertions(+), 74 deletions(-) diff --git a/Makefile b/Makefile index 90b6758143c29c..cb3ea13842670e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ EXTRAVERSION = NAME = Unicycling Gorilla RHEL_MAJOR = 7 RHEL_MINOR = 0 -RHEL_RELEASE = 121 +RHEL_RELEASE = 123 # *DOCUMENTATION* # To see a list of typical targets execute "make help" diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index 66f411a6e8ea50..cabc04383685d4 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -115,6 +115,11 @@ #define MEI_DEV_ID_LPT_HR 0x8CBA /* Lynx Point H Refresh */ #define MEI_DEV_ID_WPT_LP 0x9CBA /* Wildcat Point LP */ + +/* Host Firmware Status Registers in PCI Config Space */ +#define PCI_CFG_HFS_1 0x40 +#define PCI_CFG_HFS_2 0x48 + /* * MEI HW Section */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 371c65ae6be633..3c9e257982e3d1 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -105,15 +105,31 @@ static bool mei_me_quirk_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { u32 reg; - if (ent->device == MEI_DEV_ID_PBG_1) { - pci_read_config_dword(pdev, 0x48, ®); - /* make sure that bit 9 is up and bit 10 is down */ - if ((reg & 0x600) == 0x200) { - dev_info(&pdev->dev, "Device doesn't have valid ME Interface\n"); - return false; - } + /* Cougar Point || Patsburg */ + if (ent->device == MEI_DEV_ID_CPT_1 || + ent->device == MEI_DEV_ID_PBG_1) { + pci_read_config_dword(pdev, PCI_CFG_HFS_2, ®); + /* make sure that bit 9 (NM) is up and bit 10 (DM) is down */ + if ((reg & 0x600) == 0x200) + goto no_mei; } + + /* Lynx Point */ + if (ent->device == MEI_DEV_ID_LPT_H || + ent->device == MEI_DEV_ID_LPT_W || + ent->device == MEI_DEV_ID_LPT_HR) { + /* Read ME FW Status check for SPS Firmware */ + pci_read_config_dword(pdev, PCI_CFG_HFS_1, ®); + /* if bits [19:16] = 15, running SPS Firmware */ + if ((reg & 0xf0000) == 0xf0000) + goto no_mei; + } + return true; + +no_mei: + dev_info(&pdev->dev, "Device doesn't have valid ME Interface\n"); + return false; } /** * mei_probe - Device Initialization Routine diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a0c8a1d5f8e11b..22bfeaf10e21dd 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d5bd72ae785fe9..8979b2d7e45345 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2255,7 +2255,34 @@ static void khugepaged_alloc_sleep(void) msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } +static int khugepaged_node_load[MAX_NUMNODES]; + #ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { if (IS_ERR(*hpage)) { @@ -2289,9 +2316,8 @@ static struct page * mmap_sem in read mode is good idea also to allow greater * scalability. */ - *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. @@ -2307,6 +2333,11 @@ static struct page return *hpage; } #else +static int khugepaged_find_target_node(void) +{ + return 0; +} + static struct page *khugepaged_alloc_hugepage(bool *wait) { struct page *hpage; @@ -2512,6 +2543,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2528,12 +2560,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. */ - if (node == NUMA_NO_NODE) - node = page_to_nid(page); + node = page_to_nid(page); + khugepaged_node_load[node]++; VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -2548,9 +2581,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } diff --git a/mm/mlock.c b/mm/mlock.c index 79b7cf7d1bca72..713e462c077637 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -76,6 +76,7 @@ void clear_page_mlock(struct page *page) */ void mlock_vma_page(struct page *page) { + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { @@ -106,6 +107,7 @@ unsigned int munlock_vma_page(struct page *page) { unsigned int page_mask = 0; + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index e92b1a5aa34058..a182e560297e7f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -45,17 +73,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; unsigned long pages = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - - /* - * For a prot_numa update we only hold mmap_sem for read so there is a - * potential race with faulting where a pmd was temporarily none so - * recheck it under the lock and bail if we race - */ - if (prot_numa && unlikely(pmd_trans_huge(*pmd))) { - pte_unmap_unlock(pte, ptl); + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) return 0; - } arch_enter_lazy_mmu_mode(); do { diff --git a/mm/rmap.c b/mm/rmap.c index 009a408797f9a6..1f95c591e5eb7c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1384,9 +1384,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. + */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index dd0dc574d78dd0..38cc16b74e840a 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -329,57 +329,139 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, } /** - * may_access - verifies if a new exception is part of what is allowed - * by a dev cgroup based on the default policy + - * exceptions. This is used to make sure a child cgroup - * won't have more privileges than its parent or to - * verify if a certain access is allowed. - * @dev_cgroup: dev cgroup to be tested against - * @refex: new exception - * @behavior: behavior of the exception + * match_exception - iterates the exception list trying to match a rule + * based on type, major, minor and access type. It is + * considered a match if an exception is found that + * will contain the entire range of provided parameters. + * @exceptions: list of exceptions + * @type: device type (DEV_BLOCK or DEV_CHAR) + * @major: device file major number, ~0 to match all + * @minor: device file minor number, ~0 to match all + * @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD) + * + * returns: true in case it matches an exception completely */ -static bool may_access(struct dev_cgroup *dev_cgroup, - struct dev_exception_item *refex, - enum devcg_behavior behavior) +static bool match_exception(struct list_head *exceptions, short type, + u32 major, u32 minor, short access) { struct dev_exception_item *ex; - bool match = false; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&devcgroup_mutex), - "device_cgroup::may_access() called without proper synchronization"); + list_for_each_entry_rcu(ex, exceptions, list) { + if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) + continue; + if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR)) + continue; + if (ex->major != ~0 && ex->major != major) + continue; + if (ex->minor != ~0 && ex->minor != minor) + continue; + /* provided access cannot have more than the exception rule */ + if (access & (~ex->access)) + continue; + return true; + } + return false; +} - list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) { - if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) +/** + * match_exception_partial - iterates the exception list trying to match a rule + * based on type, major, minor and access type. It is + * considered a match if an exception's range is + * found to contain *any* of the devices specified by + * provided parameters. This is used to make sure no + * extra access is being granted that is forbidden by + * any of the exception list. + * @exceptions: list of exceptions + * @type: device type (DEV_BLOCK or DEV_CHAR) + * @major: device file major number, ~0 to match all + * @minor: device file minor number, ~0 to match all + * @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD) + * + * returns: true in case the provided range mat matches an exception completely + */ +static bool match_exception_partial(struct list_head *exceptions, short type, + u32 major, u32 minor, short access) +{ + struct dev_exception_item *ex; + + list_for_each_entry_rcu(ex, exceptions, list) { + if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK)) continue; - if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR)) + if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR)) continue; - if (ex->major != ~0 && ex->major != refex->major) + /* + * We must be sure that both the exception and the provided + * range aren't masking all devices + */ + if (ex->major != ~0 && major != ~0 && ex->major != major) continue; - if (ex->minor != ~0 && ex->minor != refex->minor) + if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; - if (refex->access & (~ex->access)) + /* + * In order to make sure the provided range isn't matching + * an exception, all its access bits shouldn't match the + * exception's access bits + */ + if (!(access & ex->access)) continue; - match = true; - break; + return true; } + return false; +} + +/** + * verify_new_ex - verifies if a new exception is part of what is allowed + * by a dev cgroup based on the default policy + + * exceptions. This is used to make sure a child cgroup + * won't have more privileges than its parent + * @dev_cgroup: dev cgroup to be tested against + * @refex: new exception + * @behavior: behavior of the exception's dev_cgroup + */ +static bool verify_new_ex(struct dev_cgroup *dev_cgroup, + struct dev_exception_item *refex, + enum devcg_behavior behavior) +{ + bool match = false; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&devcgroup_mutex), + "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { - /* the exception will deny access to certain devices */ + /* + * new exception in the child doesn't matter, only + * adding extra restrictions + */ return true; } else { - /* the exception will allow access to certain devices */ + /* + * new exception in the child will add more devices + * that can be acessed, so it can't match any of + * parent's exceptions, even slightly + */ + match = match_exception_partial(&dev_cgroup->exceptions, + refex->type, + refex->major, + refex->minor, + refex->access); + if (match) - /* - * a new exception allowing access shouldn't - * match an parent's exception - */ return false; return true; } } else { - /* only behavior == DEVCG_DEFAULT_DENY allowed here */ + /* + * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore + * the new exception will add access to more devices and must + * be contained completely in an parent's exception to be + * allowed + */ + match = match_exception(&dev_cgroup->exceptions, refex->type, + refex->major, refex->minor, + refex->access); + if (match) /* parent has an exception that matches the proposed */ return true; @@ -403,7 +485,35 @@ static int parent_has_perm(struct dev_cgroup *childcg, if (!pcg) return 1; parent = cgroup_to_devcgroup(pcg); - return may_access(parent, ex, childcg->behavior); + return verify_new_ex(parent, ex, childcg->behavior); +} + +/* + * parent_allows_removal - check if the parent cgroup allows an exception to + * be removed + * @childcg: child cgroup from where the exception will be removed + * @ex: exception being removed + */ +static bool parent_allows_removal(struct dev_cgroup *childcg, + struct dev_exception_item *ex) +{ + struct cgroup *pcg = childcg->css.cgroup->parent; + struct dev_cgroup *parent; + + if (!pcg) + return true; + parent = cgroup_to_devcgroup(pcg); + + if (childcg->behavior == DEVCG_DEFAULT_DENY) + /* It's always allowed to remove access to devices */ + return true; + + /* + * Make sure you're not removing part or a whole exception existing in + * the parent cgroup + */ + return !match_exception_partial(&parent->exceptions, ex->type, + ex->major, ex->minor, ex->access); } /** @@ -662,17 +772,21 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, switch (filetype) { case DEVCG_ALLOW: - if (!parent_has_perm(devcgroup, &ex)) - return -EPERM; /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { + /* Check if the parent allows removing it first */ + if (!parent_allows_removal(devcgroup, &ex)) + return -EPERM; dev_exception_rm(devcgroup, &ex); - return 0; + break; } + + if (!parent_has_perm(devcgroup, &ex)) + return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: @@ -753,18 +867,18 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; - struct dev_exception_item ex; - int rc; - - memset(&ex, 0, sizeof(ex)); - ex.type = type; - ex.major = major; - ex.minor = minor; - ex.access = access; + bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); - rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior); + if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) + /* Can't match any of the exceptions, even partially */ + rc = !match_exception_partial(&dev_cgroup->exceptions, + type, major, minor, access); + else + /* Need to match completely one exception to be allowed */ + rc = match_exception(&dev_cgroup->exceptions, type, major, + minor, access); rcu_read_unlock(); if (!rc)