Linux 3.4.78

md/raid10: fix two bugs in handling of known-bad-blocks.
commit b50c259e25 upstream. If we discover a bad block when reading we split the request and potentially read some of it from a different device. The code path of this has two bugs in RAID10. 1/ we get a spin_lock with _irq, but unlock without _irq!! 2/ The calculation of 'sectors_handled' is wrong, as can be clearly seen by comparison with raid1.c This leads to at least 2 warnings and a probable crash is a RAID10 ever had known bad blocks. Fixes: 856e08e237 Reported-by: Damian Nowak <spam@nowaker.net> URL: https://bugzilla.kernel.org/show_bug.cgi?id=68181 Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-01-29 05:11:12 -08:00 · 2014-01-29 05:10:42 -08:00 · 2014-01-29 05:10:42 -08:00 · 2014-01-29 05:10:42 -08:00 · 2014-01-29 05:10:42 -08:00 · 2014-01-29 05:10:42 -08:00
12 changed files with 118 additions and 84 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 VERSION = 3
 PATCHLEVEL = 4
-SUBLEVEL = 77
+SUBLEVEL = 78
 EXTRAVERSION =
 NAME = Saber-toothed Squirrel

--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@ -9,6 +9,7 @@
 #include <linux/perf_event.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/syscore_ops.h>

 #include <asm/apic.h>

@ -209,6 +210,18 @@ out:
 	return ret;
 }

+static void ibs_eilvt_setup(void)
+{
+	/*
+	 * Force LVT offset assignment for family 10h: The offsets are
+	 * not assigned by the BIOS for this family, so the OS is
+	 * responsible for doing it. If the OS assignment fails, fall
+	 * back to BIOS settings and try to setup this.
+	 */
+	if (boot_cpu_data.x86 == 0x10)
+		force_ibs_eilvt_setup();
+}
+
 static inline int get_ibs_lvt_offset(void)
 {
 	u64 val;
@ -244,6 +257,36 @@ static void clear_APIC_ibs(void *dummy)
 		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
 }

+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+	clear_APIC_ibs(NULL);
+	return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+	ibs_eilvt_setup();
+	setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+	.resume		= perf_ibs_resume,
+	.suspend	= perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+	register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
 static int __cpuinit
 perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
@ -270,18 +313,12 @@ static __init int amd_ibs_init(void)
 	if (!caps)
 		return -ENODEV;	/* ibs not supported by the cpu */

-	/*
-	 * Force LVT offset assignment for family 10h: The offsets are
-	 * not assigned by the BIOS for this family, so the OS is
-	 * responsible for doing it. If the OS assignment fails, fall
-	 * back to BIOS settings and try to setup this.
-	 */
-	if (boot_cpu_data.x86 == 0x10)
-		force_ibs_eilvt_setup();
+	ibs_eilvt_setup();

 	if (!ibs_eilvt_valid())
 		goto out;

+	perf_ibs_pm_init();
 	get_online_cpus();
 	ibs_caps = caps;
 	/* make ibs_caps visible to other cpus: */
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@ -1278,14 +1278,12 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 {
 	u32 data;
-	void *vapic;

 	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
 		return;

-	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
-	data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
-	kunmap_atomic(vapic);
+	kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+			      sizeof(u32));

 	apic_set_tpr(vcpu->arch.apic, data & 0xff);
 }
@ -1295,7 +1293,6 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 	u32 data, tpr;
 	int max_irr, max_isr;
 	struct kvm_lapic *apic;
-	void *vapic;

 	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
 		return;
@ -1310,17 +1307,24 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 		max_isr = 0;
 	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);

-	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
-	*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
-	kunmap_atomic(vapic);
+	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+			       sizeof(u32));
 }

-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
 	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
+		return -EINVAL;
+
+	if (vapic_addr) {
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+					&vcpu->arch.apic->vapic_cache,
+					vapic_addr, sizeof(u32)))
+			return -EINVAL;
+       }

 	vcpu->arch.apic->vapic_addr = vapic_addr;
+	return 0;
 }

 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@ -15,7 +15,7 @@ struct kvm_lapic {
 	bool irr_pending;
 	void *regs;
 	gpa_t vapic_addr;
-	struct page *vapic_page;
+	struct gfn_to_hva_cache vapic_cache;
 };
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
@ -46,7 +46,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);

-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@ -2728,8 +2728,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&va, argp, sizeof va))
 			goto out;
-		r = 0;
-		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
 		break;
 	}
 	case KVM_X86_SETUP_MCE: {
@ -5075,33 +5074,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 			!kvm_event_needs_reinjection(vcpu);
 }

-static void vapic_enter(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-	struct page *page;
-
-	if (!apic || !apic->vapic_addr)
-		return;
-
-	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-
-	vcpu->arch.apic->vapic_page = page;
-}
-
-static void vapic_exit(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-	int idx;
-
-	if (!apic || !apic->vapic_addr)
-		return;
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	kvm_release_page_dirty(apic->vapic_page);
-	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-}
-
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 {
 	int max_irr, tpr;
@ -5385,7 +5357,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	}

 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-	vapic_enter(vcpu);

 	r = 1;
 	while (r > 0) {
@ -5442,8 +5413,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)

 	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

-	vapic_exit(vcpu);
-
 	return r;
 }

--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@ -53,7 +53,7 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");

 #define BASE_SYSFS_ATTR_NO	2	/* Sysfs Base attr no for coretemp */
 #define NUM_REAL_CORES		32	/* Number of Real cores per cpu */
-#define CORETEMP_NAME_LENGTH	17	/* String Length of attrs */
+#define CORETEMP_NAME_LENGTH	19	/* String Length of attrs */
 #define MAX_CORE_ATTRS		4	/* Maximum no of basic attrs */
 #define TOTAL_ATTRS		(MAX_CORE_ATTRS + 1)
 #define MAX_CORE_DATA		(NUM_REAL_CORES + BASE_SYSFS_ATTR_NO)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@ -1117,7 +1117,7 @@ read_again:
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
-			sectors_handled = (r10_bio->sectors + max_sectors
+			sectors_handled = (r10_bio->sector + max_sectors
 					   - bio->bi_sector);
 			r10_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
@ -1125,7 +1125,7 @@ read_again:
 				bio->bi_phys_segments = 2;
 			else
 				bio->bi_phys_segments++;
-			spin_unlock(&conf->device_lock);
+			spin_unlock_irq(&conf->device_lock);
 			/* Cannot call generic_make_request directly
 			 * as that will be queued in __generic_make_request
 			 * and subsequent mempool_alloc might block
@ -2943,10 +2943,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (j == conf->copies) {
 				/* Cannot recover, so abort the recovery or
 				 * record a bad block */
-				put_buf(r10_bio);
-				if (rb2)
-					atomic_dec(&rb2->remaining);
-				r10_bio = rb2;
 				if (any_working) {
 					/* problem is that there are bad blocks
 					 * on other device(s)
@ -2978,6 +2974,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
 				}
+				put_buf(r10_bio);
+				if (rb2)
+					atomic_dec(&rb2->remaining);
+				r10_bio = rb2;
 				break;
 			}
 		}
--- a/drivers/staging/comedi/drivers/cb_pcidio.c
+++ b/drivers/staging/comedi/drivers/cb_pcidio.c
@ -56,10 +56,6 @@ struct pcidio_board {
 	const char *name;	/*  name of the board */
 	int dev_id;
 	int n_8255;		/*  number of 8255 chips on board */
-
-	/*  indices of base address regions */
-	int pcicontroler_badrindex;
-	int dioregs_badrindex;
 };

 static const struct pcidio_board pcidio_boards[] = {
@ -67,22 +63,16 @@ static const struct pcidio_board pcidio_boards[] = {
 	 .name = "pci-dio24",
 	 .dev_id = 0x0028,
 	 .n_8255 = 1,
-	 .pcicontroler_badrindex = 1,
-	 .dioregs_badrindex = 2,
 	 },
 	{
 	 .name = "pci-dio24h",
 	 .dev_id = 0x0014,
 	 .n_8255 = 1,
-	 .pcicontroler_badrindex = 1,
-	 .dioregs_badrindex = 2,
 	 },
 	{
 	 .name = "pci-dio48h",
 	 .dev_id = 0x000b,
 	 .n_8255 = 2,
-	 .pcicontroler_badrindex = 0,
-	 .dioregs_badrindex = 1,
 	 },
 };

@ -239,10 +229,15 @@ found:
 	if (comedi_pci_enable(pcidev, thisboard->name))
 		return -EIO;

-	devpriv->dio_reg_base
-	    =
+	/*
+	 * Use PCI BAR 2 region if non-zero length, else use PCI BAR 1 region.
+	 * PCI BAR 1 is only used for older PCI-DIO48H boards.  At some point
+	 * the PCI-DIO48H was redesigned to use the same PCI interface chip
+	 * (and same PCI BAR region) as the other boards.
+	 */
+	devpriv->dio_reg_base =
 	    pci_resource_start(devpriv->pci_dev,
-			       pcidio_boards[index].dioregs_badrindex);
+			       (pci_resource_len(pcidev, 2) ? 2 : 1));

 /*
 * Allocate the subdevice structures.  alloc_subdevice() is a
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@ -1436,17 +1436,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,

 		nilfs_clear_logs(&sci->sc_segbufs);

-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
 							sci->sc_nfreesegs,
 							NULL);
 			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
 		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@ -1447,10 +1447,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		return ret;
 	}
 done:
-	if (!PageHWPoison(hpage))
-		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
-	set_page_hwpoison_huge_page(hpage);
-	dequeue_hwpoisoned_huge_page(hpage);
+	/* overcommit hugetlb page will be freed to buddy */
+	if (PageHuge(hpage)) {
+		if (!PageHWPoison(hpage))
+			atomic_long_add(1 << compound_trans_order(hpage),
+					&mce_bad_pages);
+		set_page_hwpoison_huge_page(hpage);
+		dequeue_hwpoisoned_huge_page(hpage);
+	} else {
+		SetPageHWPoison(page);
+		atomic_long_inc(&mce_bad_pages);
+	}
+
 	/* keep elevated page count for bad page */
 	return ret;
 }
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@ -218,6 +218,14 @@ static int inode_alloc_security(struct inode *inode)
 	return 0;
 }

+static void inode_free_rcu(struct rcu_head *head)
+{
+	struct inode_security_struct *isec;
+
+	isec = container_of(head, struct inode_security_struct, rcu);
+	kmem_cache_free(sel_inode_cache, isec);
+}
+
 static void inode_free_security(struct inode *inode)
 {
 	struct inode_security_struct *isec = inode->i_security;
@ -228,8 +236,16 @@ static void inode_free_security(struct inode *inode)
 		list_del_init(&isec->list);
 	spin_unlock(&sbsec->isec_lock);

-	inode->i_security = NULL;
-	kmem_cache_free(sel_inode_cache, isec);
+	/*
+	 * The inode may still be referenced in a path walk and
+	 * a call to selinux_inode_permission() can be made
+	 * after inode_free_security() is called. Ideally, the VFS
+	 * wouldn't do this, but fixing that is a much harder
+	 * job. For now, simply free the i_security via RCU, and
+	 * leave the current inode->i_security pointer intact.
+	 * The inode will be freed after the RCU grace period too.
+	 */
+	call_rcu(&isec->rcu, inode_free_rcu);
 }

 static int file_alloc_security(struct file *file)
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@ -38,7 +38,10 @@ struct task_security_struct {

 struct inode_security_struct {
 	struct inode *inode;	/* back pointer to inode object */
-	struct list_head list;	/* list of inode_security_struct */
+	union {
+		struct list_head list;	/* list of inode_security_struct */
+		struct rcu_head rcu;	/* for freeing the inode_security_struct */
+	};
 	u32 task_sid;		/* SID of creating task */
 	u32 sid;		/* SID of this object */
 	u16 sclass;		/* security class of this object */
Author	SHA1	Message	Date
Greg Kroah-Hartman	a13224074a	Linux 3.4.78	2014-01-29 05:11:12 -08:00
NeilBrown	7e34f43dcc	md/raid10: fix two bugs in handling of known-bad-blocks. commit `b50c259e25` upstream. If we discover a bad block when reading we split the request and potentially read some of it from a different device. The code path of this has two bugs in RAID10. 1/ we get a spin_lock with _irq, but unlock without _irq!! 2/ The calculation of 'sectors_handled' is wrong, as can be clearly seen by comparison with raid1.c This leads to at least 2 warnings and a probable crash is a RAID10 ever had known bad blocks. Fixes: `856e08e237` Reported-by: Damian Nowak <spam@nowaker.net> URL: https://bugzilla.kernel.org/show_bug.cgi?id=68181 Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
NeilBrown	511375d15b	md/raid10: fix bug when raid10 recovery fails to recover a block. commit `e8b8491585` upstream. commit `e875ecea26` md/raid10 record bad blocks as needed during recovery. added code to the "cannot recover this block" path to record a bad block rather than fail the whole recovery. Unfortunately this new case was placed after r10bio was freed rather than before, yet it still uses r10bio. This is will crash with a null dereference. So move the freeing of r10bio down where it is safe. Fixes: `e875ecea26` Reported-by: Damian Nowak <spam@nowaker.net> URL: https://bugzilla.kernel.org/show_bug.cgi?id=68181 Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Andreas Rohner	ddcb318f63	nilfs2: fix segctor bug that causes file system corruption commit `70f2fe3a26` upstream. There is a bug in the function nilfs_segctor_collect, which results in active data being written to a segment, that is marked as clean. It is possible, that this segment is selected for a later segment construction, whereby the old data is overwritten. The problem shows itself with the following kernel log message: nilfs_sufile_do_cancel_free: segment 6533 must be clean Usually a few hours later the file system gets corrupted: NILFS: bad btree node (blocknr=8748107): level = 0, flags = 0x0, nchildren = 0 NILFS error (device sdc1): nilfs_bmap_last_key: broken bmap (inode number=114660) The issue can be reproduced with a file system that is nearly full and with the cleaner running, while some IO intensive task is running. Although it is quite hard to reproduce. This is what happens: 1. The cleaner starts the segment construction 2. nilfs_segctor_collect is called 3. sc_stage is on NILFS_ST_SUFILE and segments are freed 4. sc_stage is on NILFS_ST_DAT current segment is full 5. nilfs_segctor_extend_segments is called, which allocates a new segment 6. The new segment is one of the segments freed in step 3 7. nilfs_sufile_cancel_freev is called and produces an error message 8. Loop around and the collection starts again 9. sc_stage is on NILFS_ST_SUFILE and segments are freed including the newly allocated segment, which will contain active data and can be allocated at a later time 10. A few hours later another segment construction allocates the segment and causes file system corruption This can be prevented by simply reordering the statements. If nilfs_sufile_cancel_freev is called before nilfs_segctor_extend_segments the freed segments are marked as dirty and cannot be allocated any more. Signed-off-by: Andreas Rohner <andreas.rohner@gmx.net> Reviewed-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Tested-by: Andreas Rohner <andreas.rohner@gmx.net> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Steven Rostedt	9e74d93d65	SELinux: Fix possible NULL pointer dereference in selinux_inode_permission() commit `3dc91d4338` upstream. While running stress tests on adding and deleting ftrace instances I hit this bug: BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 IP: selinux_inode_permission+0x85/0x160 PGD 63681067 PUD 7ddbe067 PMD 0 Oops: 0000 [#1] PREEMPT CPU: 0 PID: 5634 Comm: ftrace-test-mki Not tainted 3.13.0-rc4-test-00033-gd2a6dde-dirty #20 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 task: ffff880078375800 ti: ffff88007ddb0000 task.ti: ffff88007ddb0000 RIP: 0010:[<ffffffff812d8bc5>] [<ffffffff812d8bc5>] selinux_inode_permission+0x85/0x160 RSP: 0018:ffff88007ddb1c48 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000800000 RCX: ffff88006dd43840 RDX: 0000000000000001 RSI: 0000000000000081 RDI: ffff88006ee46000 RBP: ffff88007ddb1c88 R08: 0000000000000000 R09: ffff88007ddb1c54 R10: 6e6576652f6f6f66 R11: 0000000000000003 R12: 0000000000000000 R13: 0000000000000081 R14: ffff88006ee46000 R15: 0000000000000000 FS: 00007f217b5b6700(0000) GS:ffffffff81e21000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033^M CR2: 0000000000000020 CR3: 000000006a0fe000 CR4: 00000000000007f0 Call Trace: security_inode_permission+0x1c/0x30 __inode_permission+0x41/0xa0 inode_permission+0x18/0x50 link_path_walk+0x66/0x920 path_openat+0xa6/0x6c0 do_filp_open+0x43/0xa0 do_sys_open+0x146/0x240 SyS_open+0x1e/0x20 system_call_fastpath+0x16/0x1b Code: 84 a1 00 00 00 81 e3 00 20 00 00 89 d8 83 c8 02 40 f6 c6 04 0f 45 d8 40 f6 c6 08 74 71 80 cf 02 49 8b 46 38 4c 8d 4d cc 45 31 c0 <0f> b7 50 20 8b 70 1c 48 8b 41 70 89 d9 8b 78 04 e8 36 cf ff ff RIP selinux_inode_permission+0x85/0x160 CR2: 0000000000000020 Investigating, I found that the inode->i_security was NULL, and the dereference of it caused the oops. in selinux_inode_permission(): isec = inode->i_security; rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, &avd); Note, the crash came from stressing the deletion and reading of debugfs files. I was not able to recreate this via normal files. But I'm not sure they are safe. It may just be that the race window is much harder to hit. What seems to have happened (and what I have traced), is the file is being opened at the same time the file or directory is being deleted. As the dentry and inode locks are not held during the path walk, nor is the inodes ref counts being incremented, there is nothing saving these structures from being discarded except for an rcu_read_lock(). The rcu_read_lock() protects against freeing of the inode, but it does not protect freeing of the inode_security_struct. Now if the freeing of the i_security happens with a call_rcu(), and the i_security field of the inode is not changed (it gets freed as the inode gets freed) then there will be no issue here. (Linus Torvalds suggested not setting the field to NULL such that we do not need to check if it is NULL in the permission check). Note, this is a hack, but it fixes the problem at hand. A real fix is to restructure the destroy_inode() to call all the destructor handlers from the RCU callback. But that is a major job to do, and requires a lot of work. For now, we just band-aid this bug with this fix (it works), and work on a more maintainable solution in the future. Link: http://lkml.kernel.org/r/20140109101932.0508dec7@gandalf.local.home Link: http://lkml.kernel.org/r/20140109182756.17abaaa8@gandalf.local.home Signed-off-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Jean Delvare	e34cdde46b	hwmon: (coretemp) Fix truncated name of alarm attributes commit `3f9aec7610` upstream. When the core number exceeds 9, the size of the buffer storing the alarm attribute name is insufficient and the attribute name is truncated. This causes libsensors to skip these attributes as the truncated name is not recognized. Reported-by: Andreas Hollmann <hollmann@in.tum.de> Signed-off-by: Jean Delvare <khali@linux-fr.org> Signed-off-by: Guenter Roeck <linux@roeck-us.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Jianguo Wu	9a22404cf9	mm/memory-failure.c: recheck PageHuge() after hugetlb page migrate successfully commit `a49ecbcd7b` upstream. After a successful hugetlb page migration by soft offline, the source page will either be freed into hugepage_freelists or buddy(over-commit page). If page is in buddy, page_hstate(page) will be NULL. It will hit a NULL pointer dereference in dequeue_hwpoisoned_huge_page(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [<ffffffff81163761>] dequeue_hwpoisoned_huge_page+0x131/0x1d0 PGD c23762067 PUD c24be2067 PMD 0 Oops: 0000 [#1] SMP So check PageHuge(page) after call migrate_pages() successfully. [wujg: backport to 3.4: - adjust context - s/num_poisoned_pages/mce_bad_pages/] Signed-off-by: Jianguo Wu <wujianguo@huawei.com> Tested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Robert Richter	7e99f21684	perf/x86/amd/ibs: Fix waking up from S3 for AMD family 10h commit `bee09ed91c` upstream. On AMD family 10h we see following error messages while waking up from S3 for all non-boot CPUs leading to a failed IBS initialization: Enabling non-boot CPUs ... smpboot: Booting Node 0 Processor 1 APIC 0x1 [Firmware Bug]: cpu 1, try to use APIC500 (LVT offset 0) for vector 0x400, but the register is already in use for vector 0xf9 on another cpu perf: IBS APIC setup failed on cpu #1 process: Switch to broadcast mode on CPU1 CPU1 is up ... ACPI: Waking up from system sleep state S3 Reason for this is that during suspend the LVT offset for the IBS vector gets lost and needs to be reinialized while resuming. The offset is read from the IBSCTL msr. On family 10h the offset needs to be 1 as offset 0 is used for the MCE threshold interrupt, but firmware assings it for IBS to 0 too. The kernel needs to reprogram the vector. The msr is a readonly node msr, but a new value can be written via pci config space access. The reinitialization is implemented for family 10h in setup_ibs_ctl() which is forced during IBS setup. This patch fixes IBS setup after waking up from S3 by adding resume/supend hooks for the boot cpu which does the offset reinitialization. Marking it as stable to let distros pick up this fix. Signed-off-by: Robert Richter <rric@kernel.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/1389797849-5565-1-git-send-email-rric.net@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Ian Abbott	6013932c94	staging: comedi: 8255_pci: fix for newer PCI-DIO48H commit `0283f7a100` upstream. At some point, Measurement Computing / ComputerBoards redesigned the PCI-DIO48H to use a PLX PCI interface chip instead of an AMCC chip. This meant they had to put their hardware registers in the PCI BAR 2 region instead of PCI BAR 1. Unfortunately, they kept the same PCI device ID for the new design. This means the driver recognizes the newer cards, but doesn't work (and is likely to screw up the local configuration registers of the PLX chip) because it's using the wrong region. Since the PCI subvendor and subdevice IDs were both zero on the old design, but are the same as the vendor and device on the new design, we can tell the old design and new design apart easily enough. Split the existing entry for the PCI-DIO48H in `pci_8255_boards[]` into two new entries, referenced by different entries in the PCI device ID table `pci_8255_pci_table[]`. Use the same board name for both entries. Signed-off-by: Ian Abbott <abbotti@mev.co.uk> Acked-by: H Hartley Sweeten <hsweeten@visionengravers.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00
Andy Honig	777f8f3bcd	KVM: x86: Convert vapic synchronization to _cached functions (CVE-2013-6368) commit `fda4e2e855` upstream. In kvm_lapic_sync_from_vapic and kvm_lapic_sync_to_vapic there is the potential to corrupt kernel memory if userspace provides an address that is at the end of a page. This patches concerts those functions to use kvm_write_guest_cached and kvm_read_guest_cached. It also checks the vapic_address specified by userspace during ioctl processing and returns an error to userspace if the address is not a valid GPA. This is generally not guest triggerable, because the required write is done by firmware that runs before the guest. Also, it only affects AMD processors and oldish Intel that do not have the FlexPriority feature (unless you disable FlexPriority, of course; then newer processors are also affected). Fixes: `b93463aa59` ('KVM: Accelerated apic support') Reported-by: Andrew Honig <ahonig@google.com> Cc: stable@vger.kernel.org Signed-off-by: Andrew Honig <ahonig@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> [ lizf: backported to 3.4: based on Paolo's backport hints for <3.10 ] Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-29 05:10:42 -08:00