From 2e04ef76916d1e29a077ea9d0f2003c8fd86724d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: fix comment style I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- drivers/lguest/x86/core.c | 372 ++++++++++++++++++++++++++++++---------------- 1 file changed, 245 insertions(+), 127 deletions(-) (limited to 'drivers/lguest/x86/core.c') diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309a..96f7d88ec7f8 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include #include #include @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore @@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * * We could also try using this hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in ,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } -- cgit v1.2.3 From a91d74a3c4de8115295ee87350c13a329164aaaf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: update commentry Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation. Signed-off-by: Rusty Russell Cc: Ingo Molnar Cc: Paul McKenney --- Documentation/lguest/lguest.c | 184 +++++++++++++++++++++++++++--------- arch/x86/include/asm/lguest_hcall.h | 8 +- arch/x86/lguest/boot.c | 99 ++++++++++++++----- arch/x86/lguest/i386_head.S | 2 + drivers/lguest/core.c | 7 +- drivers/lguest/hypercalls.c | 6 +- drivers/lguest/lguest_device.c | 11 ++- drivers/lguest/lguest_user.c | 100 ++++++++++++++++++-- drivers/lguest/page_tables.c | 84 ++++++++++++---- drivers/lguest/x86/core.c | 2 +- drivers/lguest/x86/switcher_32.S | 6 +- 11 files changed, 398 insertions(+), 111 deletions(-) (limited to 'drivers/lguest/x86/core.c') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index aa66a52b73e9..45163651b519 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -49,7 +49,7 @@ #include "linux/virtio_ring.h" #include "asm/bootparam.h" /*L:110 - * We can ignore the 39 include files we need for this program, but I do want + * We can ignore the 42 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num) PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ close(fd); return addr; @@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start) } /*:*/ -/* +/*L:200 * Device Handling. * * When the Guest gives us a buffer, it sends an array of addresses and sizes. @@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc, return next; } -/* This actually sends the interrupt for this virtqueue */ +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; @@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq) } /* - * This looks in the virtqueue and for the first available buffer, and converts + * This looks in the virtqueue for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. + * This function waits if necessary, and returns the descriptor number found. */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], @@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct vring_desc *desc; u16 last_avail = lg_last_avail(vq); + /* There's nothing available? */ while (last_avail == vq->vring.avail->idx) { u64 event; - /* OK, tell Guest about progress up to now. */ + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ trigger_irq(vq); /* OK, now we need to know about added descriptors. */ @@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, } /* - * After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). + * After we've used one of their buffers, we tell the Guest about it. Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { @@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq) struct console_abort *abort = vq->dev->priv; struct iovec iov[vq->vring.num]; - /* Make sure there's a descriptor waiting. */ + /* Make sure there's a descriptor available. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* Read it in. */ + /* Read into it. This is where we usually wait. */ len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { /* Ran out of input? */ @@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq) pause(); } + /* Tell the Guest we used a buffer. */ add_used_and_trigger(vq, head, len); /* @@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here, for the Guest to give us something. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); if (len <= 0) err(1, "Write to stdout gave %i", len); iov_consume(iov, out, len); } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ add_used(vq, head, 0); } @@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here for the Guest to give us a packet. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ if (writev(net_info->tunfd, iov, out) < 0) errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ add_used(vq, head, 0); } -/* Will reading from this file descriptor block? */ +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ static bool will_block(int fd) { fd_set fdset; @@ -880,7 +917,11 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This handles packets coming in from the tun device to our Guest. */ +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ static void net_input(struct virtqueue *vq) { int len; @@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq) struct iovec iov[vq->vring.num]; struct net_info *net_info = vq->dev->priv; + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); - /* Deliver interrupt now, since we're about to sleep. */ + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. + */ if (vq->pending_used && will_block(net_info->tunfd)) trigger_irq(vq); + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ len = readv(net_info->tunfd, iov, in); if (len <= 0) err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ add_used(vq, head, len); } +/*:*/ -/* This is the helper to create threads. */ +/* This is the helper to create threads: run the service routine in a loop. */ static int do_thread(void *_vq) { struct virtqueue *vq = _vq; @@ -950,11 +1008,14 @@ static void reset_device(struct device *dev) signal(SIGCHLD, (void *)kill_launcher); } +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ static void create_thread(struct virtqueue *vq) { /* - * Create stack for thread and run it. Since the stack grows upwards, - * we point the stack pointer to the end of this region. + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, @@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq) err(1, "Creating eventfd"); args[2] = vq->eventfd; - /* Attach an eventfd to this virtqueue: it will go off - * when the Guest does an LHCALL_NOTIFY for this vq. */ + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ if (write(lguest_fd, &args, sizeof(args)) != 0) err(1, "Attaching eventfd"); - /* CLONE_VM: because it has to access the Guest memory, and - * SIGCHLD so we get a signal if it dies. */ + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); if (vq->thread == (pid_t)-1) err(1, "Creating clone"); - /* We close our local copy, now the child has it. */ + + /* We close our local copy now the child has it. */ close(vq->eventfd); } @@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev) } } -/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ static void handle_output(unsigned long addr) { struct device *i; @@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr) for (i = devices.dev; i; i = i->next) { struct virtqueue *vq; - /* Notifications to device descriptors update device status. */ + /* + * Notifications to device descriptors mean they updated the + * device status. + */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Devices *can* be used before status is set to DRIVER_OK. */ + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. + */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; if (i->running) errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ start_device(i); return; } @@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ vq->service = service; vq->thread = (pid_t)-1; @@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf) /* * This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. * * See what I mean about userspace being boring? */ @@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg) verbose("device %u: tun %s: %s\n", devices.device_num, tapif, arg); } - -/* - * Our block (disk) device should be really simple: the Guest asks for a block - * number and we read or write that position in the file. Unfortunately, that - * was amazingly slow: the Guest waits until the read is finished before - * running anything else, even if it could have been doing useful work. - * - * We could use async I/O, except it's reputed to suck so hard that characters - * actually go missing from your code when you try to use it. - * - * So this was one reason why lguest now does all virtqueue servicing in - * separate threads: it's more efficient and more like a real device. - */ +/*:*/ /* This hangs off device->priv. */ struct vblk_info @@ -1512,8 +1589,16 @@ struct vblk_info /*L:210 * The Disk * - * Remember that the block device is handled by a separate I/O thread. We head - * straight into the core of that thread here: + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. */ static void blk_request(struct virtqueue *vq) { @@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq) struct iovec iov[vq->vring.num]; off64_t off; - /* Get the next request. */ + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); /* @@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq) out = convert(&iov[0], struct virtio_blk_outhdr); in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ off = out->sector * 512; /* @@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); + /* Finished that request. */ add_used(vq, head, wlen); } @@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq) errx(1, "Output buffers in rng?"); /* - * This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); @@ -1818,7 +1910,9 @@ int main(int argc, char *argv[]) devices.lastdev = NULL; devices.next_irq = 1; + /* We're CPU 0. In fact, that's the only CPU possible right now. */ cpu_id = 0; + /* * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command @@ -1926,7 +2020,7 @@ int main(int argc, char *argv[]) */ tell_kernel(start); - /* Ensure that we terminate if a child dies. */ + /* Ensure that we terminate if a device-servicing child dies. */ signal(SIGCHLD, kill_launcher); /* If we exit via err(), this kills all the threads, restores tty. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index cceb73e12e50..ba0eed8aa1a6 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -35,10 +35,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 025c04d18f2b..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call, async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call, } #endif -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. */ +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ static void lguest_leave_lazy_mmu_mode(void) { kvm_hypercall0(LHCALL_FLUSH_ASYNC); @@ -250,13 +253,11 @@ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); /*M:003 - * Note that we don't check for outstanding interrupts when we re-enable them - * (or when we unmask an interrupt). This seems to work for the moment, since - * interrupts are rare and we'll just get the interrupt on the next timer tick, - * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would - * be to put the "irq_enabled" field in a page by itself, and have the Host - * write-protect it when an interrupt comes in when irqs are disabled. There - * will then be a page fault as soon as interrupts are re-enabled. + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come @@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. + * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ /* - * The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low, ptep->pte_high); #else @@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, #endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) } #ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte_atomic(ptep, pte); @@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { native_pte_clear(mm, addr, ptep); lguest_pte_update(mm, addr, ptep); } -void lguest_pmd_clear(pmd_t *pmdp) +static void lguest_pmd_clear(pmd_t *pmdp) { lguest_set_pmd(pmdp, __pmd(0)); } @@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void) irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { irq_to_desc_alloc_node(irq, 0); @@ -1298,7 +1353,7 @@ __init void lguest_init(void) */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index db6aa95eb054..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -102,6 +102,7 @@ send_interrupts: * create one manually here. */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ popl %eax ret @@ -125,6 +126,7 @@ ENTRY(lg_restore_fl) jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cd058bc903ff..1e2cb846b3c9 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* * It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. + * Launcher. */ if (cpu->pending_notify) { + /* + * Does it just needs to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Laucher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 787ab4bc09f0..83511eb0923d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SHUTDOWN: { char msg[128]; /* - * Shutdown is such a trivial hypercall that we do it in four + * Shutdown is such a trivial hypercall that we do it in five * lines right here. * * If the lgread fails, it will call kill_guest() itself; the @@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. :*/ /*H:100 diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index cc000e79c3d1..1401c1ace1ec 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) extern void lguest_setup_irq(unsigned int irq); /* - * This routine finds the first virtqueue described in the configuration of + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. - * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, @@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. + */ ldev->vdev.id.device = d->type; /* * We have a simple set of routines for querying the device's diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7e92017103dc..b4d3f7ca554f 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,9 +1,8 @@ -/*P:200 - * This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. :*/ #include #include @@ -13,14 +12,41 @@ #include #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. + */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. + * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. + * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; @@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); /* * We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. + * version, then free it. */ synchronize_rcu(); kfree(old); @@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); @@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } @@ -307,10 +387,10 @@ unlock: * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support * here. */ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -29,10 +29,10 @@ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -52,9 +52,8 @@ :*/ /* - * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) @@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); /*H:320 * The page table code is curly enough to need helper functions to keep it - * clear and clean. + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These two functions just like the above two, except they access the Guest + * These functions are just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. + */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ @@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* - * Once we know how much memory we have we can construct simple identity (which +/*H:505 + * To get through boot, we construct simple identity page mappings (which * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to * know its size here). @@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif /* @@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } +#ifdef CONFIG_X86_PAE /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). */ -#ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. + */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, #endif /* - * We return the top level (guest-physical) address: remember where - * this is. + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. */ return (unsigned long)pgdir - mem_base; } @@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. */ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ @@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * * The hooks were designed for KVM, but we can also put them to good use. :*/ diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,7 +1,7 @@ /*P:900 - * This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage -- cgit v1.2.3