Diffstat (limited to 'efi_memtest')
-rw-r--r--  efi_memtest/Makefile | 7
-rw-r--r--  efi_memtest/MemtestEfi.c | 2
-rw-r--r--  efi_memtest/memtest86+/bios/cpuid.h (renamed from efi_memtest/memtest86+/cpuid.h) | 0
-rw-r--r--  efi_memtest/memtest86+/bios/init.c (renamed from efi_memtest/memtest86+/init.c) | 0
-rw-r--r--  efi_memtest/memtest86+/bios/logger.h (renamed from efi_memtest/logger.h) | 0
-rw-r--r--  efi_memtest/memtest86+/bios/main_asm.h | 49
-rw-r--r--  efi_memtest/memtest86+/bios/test.c (renamed from efi_memtest/memtest86+/test.c) | 0
-rw-r--r--  efi_memtest/memtest86+/bios/test_cache.h | 20
-rw-r--r--  efi_memtest/memtest86+/bios/vmem.c (renamed from efi_memtest/memtest86+/vmem.c) | 0
-rw-r--r--  efi_memtest/memtest86+/efi/cpuid.h | 205
-rw-r--r--  efi_memtest/memtest86+/efi/init.c | 1297
-rw-r--r--  efi_memtest/memtest86+/efi/logger.c (renamed from efi_memtest/logger.c) | 0
-rw-r--r--  efi_memtest/memtest86+/efi/logger.h | 0
-rw-r--r--  efi_memtest/memtest86+/efi/main.h | 1
-rw-r--r--  efi_memtest/memtest86+/efi/main_asm.h | 49
-rw-r--r--  efi_memtest/memtest86+/efi/test.c | 1551
-rw-r--r--  efi_memtest/memtest86+/efi/test_cache.h | 20
-rw-r--r--  efi_memtest/memtest86+/efi/vmem.c | 159
-rw-r--r--  efi_memtest/memtest86+/error.c | 10
-rw-r--r--  efi_memtest/memtest86+/from main | 342
-rw-r--r--  efi_memtest/memtest86+/main.c | 393
-rw-r--r--  efi_memtest/memtest86+/test.h | 22
22 files changed, 3717 insertions, 410 deletions
diff --git a/efi_memtest/Makefile b/efi_memtest/Makefile
index c351d0c..7451619 100644
--- a/efi_memtest/Makefile
+++ b/efi_memtest/Makefile
@@ -90,12 +90,17 @@ AutoGen.obj: memtest86+/efi/Include/AutoGen.c
$(CC) $(CFLAGS) $(PREPROCESSOR) $(M) -c -o $@ $< \
-I"memtest86+/efi"
+%.o: memtest86+/efi/%.c
+ $(CC) $(CFLAGS) $(PREPROCESSOR) $(M) -c -o $@ $< \
+ -I"memtest86+" \
+ -I"memtest86+/efi"
+
clean:
rm -f OUTPUT/*
rm -f memtest86+/*.o
rm -f *.o
rm -f MemtestEfi.obj
- rm MemtestEfi.map
+ rm -f MemtestEfi.map
move:
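
[Editorial note, not part of the patch: for a hypothetical memtest86+/efi/init.c the new pattern rule above expands to `$(CC) $(CFLAGS) $(PREPROCESSOR) $(M) -c -o init.o memtest86+/efi/init.c -I"memtest86+" -I"memtest86+/efi"`, i.e. the same shape as the existing AutoGen.obj recipe but with the memtest86+ root added to the include path.]
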
diff --git a/efi_memtest/MemtestEfi.c b/efi_memtest/MemtestEfi.c
index c9e44a0..b847aaa 100644
--- a/efi_memtest/MemtestEfi.c
+++ b/efi_memtest/MemtestEfi.c
@@ -16,7 +16,7 @@ UefiMain (
{
Print(L"MemtestEfi started\n");
- //test_start();
+ test_start();
return EFI_SUCCESS;
}
diff --git a/efi_memtest/memtest86+/cpuid.h b/efi_memtest/memtest86+/bios/cpuid.h
index 0feb56e..0feb56e 100644
--- a/efi_memtest/memtest86+/cpuid.h
+++ b/efi_memtest/memtest86+/bios/cpuid.h
diff --git a/efi_memtest/memtest86+/init.c b/efi_memtest/memtest86+/bios/init.c
index 32bff7f..32bff7f 100644
--- a/efi_memtest/memtest86+/init.c
+++ b/efi_memtest/memtest86+/bios/init.c
diff --git a/efi_memtest/logger.h b/efi_memtest/memtest86+/bios/logger.h
index e69de29..e69de29 100644
--- a/efi_memtest/logger.h
+++ b/efi_memtest/memtest86+/bios/logger.h
diff --git a/efi_memtest/memtest86+/bios/main_asm.h b/efi_memtest/memtest86+/bios/main_asm.h
new file mode 100644
index 0000000..8e6efbc
--- /dev/null
+++ b/efi_memtest/memtest86+/bios/main_asm.h
@@ -0,0 +1,49 @@
+static inline void enable_fp_processing(void) {
+ if (cpu_id.fid.bits.fpu)
+ __asm__ __volatile__
+ (
+ "movl %%cr0, %%eax\n\t"
+ "andl $0x7, %%eax\n\t"
+ "movl %%eax, %%cr0\n\t"
+ : :
+ : "ax"
+ );
+ if (cpu_id.fid.bits.sse)
+ __asm__ __volatile__
+ (
+ "movl %%cr4, %%eax\n\t"
+ "orl $0x00000200, %%eax\n\t"
+ "movl %%eax, %%cr4\n\t"
+ : :
+ : "ax"
+ );
+
+}
+
+static inline void setup_mm_modes(void) {
+ /* If we have PAE, turn it on */
+ if (cpu_id.fid.bits.pae == 1) {
+ __asm__ __volatile__
+ (
+ "movl %%cr4, %%eax\n\t"
+ "orl $0x00000020, %%eax\n\t"
+ "movl %%eax, %%cr4\n\t"
+ : :
+ : "ax"
+ );
+ cprint(LINE_TITLE+1, COL_MODE, "(PAE Mode)");
+ }
+ /* If this is a 64-bit CPU, enable long mode */
+ if (cpu_id.fid.bits.lm == 1) {
+ __asm__ __volatile__
+ (
+ "movl $0xc0000080, %%ecx\n\t"
+ "rdmsr\n\t"
+ "orl $0x00000100, %%eax\n\t"
+ "wrmsr\n\t"
+ : :
+ : "ax", "cx"
+ );
+ cprint(LINE_TITLE+1, COL_MODE, "(X64 Mode)");
+ }
+}
\ No newline at end of file
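
[Editorial reference, not part of the patch: the magic numbers used in enable_fp_processing() and setup_mm_modes() above correspond to well-known control-register bits. The names below are for readability only and are not defined in this tree.]

    #define CR4_OSFXSR 0x00000200  /* CR4 bit 9: OS supports FXSAVE/FXRSTOR, required for SSE */
    #define CR4_PAE    0x00000020  /* CR4 bit 5: Physical Address Extension                   */
    #define MSR_EFER   0xc0000080  /* Extended Feature Enable Register, accessed via rdmsr/wrmsr */
    #define EFER_LME   0x00000100  /* EFER bit 8: Long Mode Enable                            */
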
diff --git a/efi_memtest/memtest86+/test.c b/efi_memtest/memtest86+/bios/test.c
index 864dfcc..864dfcc 100644
--- a/efi_memtest/memtest86+/test.c
+++ b/efi_memtest/memtest86+/bios/test.c
diff --git a/efi_memtest/memtest86+/bios/test_cache.h b/efi_memtest/memtest86+/bios/test_cache.h
new file mode 100644
index 0000000..48b4869
--- /dev/null
+++ b/efi_memtest/memtest86+/bios/test_cache.h
@@ -0,0 +1,20 @@
+static inline void cache_off(void)
+{
+ asm(
+ "push %eax\n\t"
+ "movl %cr0,%eax\n\t"
+ "orl $0x40000000,%eax\n\t" /* Set CD */
+ "movl %eax,%cr0\n\t"
+ "wbinvd\n\t"
+ "pop %eax\n\t");
+}
+
+static inline void cache_on(void)
+{
+ asm(
+ "push %eax\n\t"
+ "movl %cr0,%eax\n\t"
+ "andl $0x9fffffff,%eax\n\t" /* Clear CD and NW */
+ "movl %eax,%cr0\n\t"
+ "pop %eax\n\t");
+}
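
[Editorial usage sketch, not part of the patch; run_pass_uncached() is a hypothetical placeholder, not a function in this tree. The helpers above can bracket a stretch of work that should reach DRAM directly instead of the cache.]

    extern void run_pass_uncached(void);   /* hypothetical work function */

    static void example_uncached_run(void)
    {
        cache_off();           /* set CR0.CD and flush dirty lines (wbinvd) */
        run_pass_uncached();   /* placeholder for the timed or tested loop  */
        cache_on();            /* clear CR0.CD and CR0.NW again             */
    }
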
diff --git a/efi_memtest/memtest86+/vmem.c b/efi_memtest/memtest86+/bios/vmem.c
index 6125e0d..6125e0d 100644
--- a/efi_memtest/memtest86+/vmem.c
+++ b/efi_memtest/memtest86+/bios/vmem.c
diff --git a/efi_memtest/memtest86+/efi/cpuid.h b/efi_memtest/memtest86+/efi/cpuid.h
new file mode 100644
index 0000000..19e2d51
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/cpuid.h
@@ -0,0 +1,205 @@
+
+
+#ifndef CPUID_H_
+#define CPUID_H_
+
+
+/*
+ * cpuid.h --
+ * contains the data structures required for CPUID
+ * implementation.
+ */
+
+#define CPUID_VENDOR_LENGTH 3 /* 3 GPRs hold vendor ID */
+#define CPUID_VENDOR_STR_LENGTH (CPUID_VENDOR_LENGTH * sizeof(uint32_t) + 1)
+#define CPUID_BRAND_LENGTH 12 /* 12 GPRs hold the brand string */
+#define CPUID_BRAND_STR_LENGTH (CPUID_BRAND_LENGTH * sizeof(uint32_t) + 1)
+
+extern struct cpu_ident cpu_id;
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("\t"
+ "push %%rbx; cpuid; mov %%ebx, %%edi; pop %%rbx"
+ : "=a" (*eax),
+ "=D" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Typedef for storing the Cache Information */
+typedef union {
+ unsigned char ch[48];
+ uint32_t uint[12];
+ struct {
+ uint32_t fill1:24; /* Bit 0 */
+ uint32_t l1_i_sz:8;
+ uint32_t fill2:24;
+ uint32_t l1_d_sz:8;
+ uint32_t fill3:16;
+ uint32_t l2_sz:16;
+ uint32_t fill4:18;
+ uint32_t l3_sz:14;
+ uint32_t fill5[8];
+ } amd;
+} cpuid_cache_info_t;
+
+/* Typedef for storing the CPUID Vendor String */
+typedef union {
+ /* Note: the extra byte in the char array is for '\0'. */
+ char char_array[CPUID_VENDOR_STR_LENGTH];
+ uint32_t uint32_array[CPUID_VENDOR_LENGTH];
+} cpuid_vendor_string_t;
+
+/* Typedef for storing the CPUID Brand String */
+typedef union {
+ /* Note: the extra byte in the char array is for '\0'. */
+ char char_array[CPUID_BRAND_STR_LENGTH];
+ uint32_t uint32_array[CPUID_BRAND_LENGTH];
+} cpuid_brand_string_t;
+
+/* Typedef for storing CPUID Version */
+typedef union {
+ uint32_t flat;
+ struct {
+ uint32_t stepping:4; /* Bit 0 */
+ uint32_t model:4;
+ uint32_t family:4;
+ uint32_t processorType:2;
+ uint32_t reserved1514:2;
+ uint32_t extendedModel:4;
+ uint32_t extendedFamily:8;
+ uint32_t reserved3128:4; /* Bit 31 */
+ } bits;
+} cpuid_version_t;
+
+/* Typedef for storing CPUID Processor Information */
+typedef union {
+ uint32_t flat;
+ struct {
+ uint32_t brandIndex:8; /* Bit 0 */
+ uint32_t cflushLineSize:8;
+ uint32_t logicalProcessorCount:8;
+ uint32_t apicID:8; /* Bit 31 */
+ } bits;
+} cpuid_proc_info_t;
+
+/* Typedef for storing CPUID Feature flags */
+typedef union {
+ uint32_t flat;
+ struct {
+ uint32_t :1;
+ } bits;
+} cpuid_custom_features;
+
+/* Typedef for storing CPUID Feature flags */
+typedef union {
+ uint32_t uint32_array[3];
+ struct {
+ uint32_t fpu:1; /* EDX feature flags, bit 0 */
+ uint32_t vme:1;
+ uint32_t de:1;
+ uint32_t pse:1;
+ uint32_t rdtsc:1;
+ uint32_t msr:1;
+ uint32_t pae:1;
+ uint32_t mce:1;
+ uint32_t cx8:1;
+ uint32_t apic:1;
+ uint32_t bit10:1;
+ uint32_t sep:1;
+ uint32_t mtrr:1;
+ uint32_t pge:1;
+ uint32_t mca:1;
+ uint32_t cmov:1;
+ uint32_t pat:1;
+ uint32_t pse36:1;
+ uint32_t psn:1;
+ uint32_t cflush:1;
+ uint32_t bit20:1;
+ uint32_t ds:1;
+ uint32_t acpi:1;
+ uint32_t mmx:1;
+ uint32_t fxsr:1;
+ uint32_t sse:1;
+ uint32_t sse2:1;
+ uint32_t ss:1;
+ uint32_t htt:1;
+ uint32_t tm:1;
+ uint32_t bit30:1;
+ uint32_t pbe:1; /* EDX feature flags, bit 31 */
+ uint32_t sse3:1; /* ECX feature flags, bit 0 */
+ uint32_t mulq:1;
+ uint32_t bit2:1;
+ uint32_t mon:1;
+ uint32_t dscpl:1;
+ uint32_t vmx:1;
+ uint32_t smx:1;
+ uint32_t eist:1;
+ uint32_t tm2:1;
+ uint32_t bits_9_31:23;
+ uint32_t bits0_28:29; /* EDX extended feature flags, bit 0 */
+ uint32_t lm:1; /* Long Mode */
+ uint32_t bits_30_31:2; /* EDX extended feature flags, bit 32 */
+ } bits;
+} cpuid_feature_flags_t;
+
+/* An overall structure to cache all of the CPUID information */
+struct cpu_ident {
+ uint32_t max_cpuid;
+ uint32_t max_xcpuid;
+ uint32_t dts_pmp;
+ cpuid_version_t vers;
+ cpuid_proc_info_t info;
+ cpuid_feature_flags_t fid;
+ cpuid_vendor_string_t vend_id;
+ cpuid_brand_string_t brand_id;
+ cpuid_cache_info_t cache_info;
+ cpuid_custom_features custom;
+};
+
+struct cpuid4_eax {
+ uint32_t ctype:5;
+ uint32_t level:3;
+ uint32_t is_self_initializing:1;
+ uint32_t is_fully_associative:1;
+ uint32_t reserved:4;
+ uint32_t num_threads_sharing:12;
+ uint32_t num_cores_on_die:6;
+};
+
+struct cpuid4_ebx {
+ uint32_t coherency_line_size:12;
+ uint32_t physical_line_partition:10;
+ uint32_t ways_of_associativity:10;
+};
+
+struct cpuid4_ecx {
+ uint32_t number_of_sets:32;
+};
+
+void get_cpuid();
+
+#endif // CPUID_H_
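
[Editorial sketch, not part of the patch: how the cpuid() wrapper and cpuid_vendor_string_t above might be used together; read_vendor_string() is a hypothetical name. CPUID leaf 0 returns the vendor bytes in EBX, EDX, ECX order.]

    static void read_vendor_string(cpuid_vendor_string_t *vend)
    {
        unsigned int eax, ebx, ecx, edx;

        cpuid(0, &eax, &ebx, &ecx, &edx);  /* leaf 0: max leaf in EAX, vendor ID in EBX/EDX/ECX */
        vend->uint32_array[0] = ebx;       /* "Genu" / "Auth" ... */
        vend->uint32_array[1] = edx;       /* "ineI" / "enti"     */
        vend->uint32_array[2] = ecx;       /* "ntel" / "cAMD"     */
        vend->char_array[CPUID_VENDOR_STR_LENGTH - 1] = '\0';
    }
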
diff --git a/efi_memtest/memtest86+/efi/init.c b/efi_memtest/memtest86+/efi/init.c
new file mode 100644
index 0000000..6388443
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/init.c
@@ -0,0 +1,1297 @@
+/*
+ * MemTest86+ V5 Specific code (GPL V2.0)
+ * By Samuel DEMEULEMEESTER, sdemeule@memtest.org
+ * http://www.canardpc.com - http://www.memtest.org
+ * ------------------------------------------------
+ * init.c - MemTest-86 Version 3.6
+ *
+ * Released under version 2 of the Gnu Public License.
+ * By Chris Brady
+ */
+
+
+#include "stdin.h"
+#include "stddef.h"
+#include "test.h"
+#include "defs.h"
+#include "config.h"
+#include "cpuid.h"
+#include "smp.h"
+#include "io.h"
+#include "spd.h"
+#include "pci.h"
+#include "controller.h"
+
+extern struct tseq tseq[];
+extern short memsz_mode;
+extern int num_cpus;
+extern int act_cpus;
+extern int found_cpus;
+unsigned long imc_type = 0;
+extern int maxcpus;
+extern char cpu_mask[];
+extern void initialise_cpus();
+
+/* Here we store all of the cpuid data */
+extern struct cpu_ident cpu_id;
+
+int l1_cache=0, l2_cache=0, l3_cache=0;
+int tsc_invariable = 0;
+ulong extclock;
+
+ulong memspeed(ulong src, ulong len, int iter);
+static void cpu_type(void);
+static int cpuspeed(void);
+static void get_cache_size();
+static void cpu_cache_speed();
+void get_cpuid();
+int beepmode;
+extern short dmi_initialized;
+extern int dmi_err_cnts[MAX_DMI_MEMDEVS];
+
+/* Failsafe function */
+/* msec: number of ms to wait - scs: scancode expected to stop */
+/* bits: 0 = extended detection - 1: SMP - 2: Temp Check */
+/* 3: MP SMP - 4-7: RSVD */
+void failsafe(int msec, int scs)
+{
+ int i;
+ ulong sh, sl, l, h, t;
+ unsigned char c;
+ volatile char *pp;
+
+ for(i=0, pp=(char *)(SCREEN_ADR+(18*160)+(18*2)+1); i<40; i++, pp+=2) {
+ *pp = 0x1E;
+ }
+ for(i=0, pp=(char *)(SCREEN_ADR+(18*160)+(18*2)+1); i<3; i++, pp+=2) {
+ *pp = 0x9E;
+ }
+ for(i=0, pp=(char *)(SCREEN_ADR+(18*160)+(55*2)+1); i<3; i++, pp+=2) {
+ *pp = 0x9E;
+ }
+
+ cprint(18, 18, "==> Press F1 to enter Fail-Safe Mode <==");
+
+ if(vv->fail_safe & 2)
+ {
+ cprint(19, 15, "==> Press F2 to force Multi-Threading (SMP) <==");
+ }
+
+ /* save the starting time */
+ asm __volatile__
+ ("rdtsc":"=a" (sl),"=d" (sh));
+
+ /* loop for n seconds */
+ while (1) {
+ /* asm __volatile__(
+ "rdtsc":"=a" (l),"=d" (h));
+ asm __volatile__ (
+ "subl %2,%0\n\t"
+ "sbbl %3,%1"
+ :"=a" (l), "=d" (h)
+ :"g" (sl), "g" (sh),
+ "0" (l), "1" (h));*/
+ h = 1; // TODO remove
+ l = 1; // TODO remove
+ t = h * ((unsigned)0xffffffff / vv->clks_msec);
+ t += (l / vv->clks_msec);
+
+ /* Is the time up? */
+ if (t >= msec) { break; }
+
+ /* Is expected Scan code pressed? */
+ c = get_key();
+ c &= 0x7f;
+
+ /* F1 */
+ if(c == scs) { vv->fail_safe |= 1; break; }
+
+ /* F2 */
+ if(c == scs+1)
+ {
+ vv->fail_safe ^= 2;
+ break;
+
+ }
+
+ /* F3 */
+ if(c == scs+2)
+ {
+ if(vv->fail_safe & 2) { vv->fail_safe ^= 2; }
+ vv->fail_safe |= 8;
+ break;
+ }
+ }
+
+ cprint(18, 18, " ");
+ cprint(19, 15, " ");
+
+ for(i=0, pp=(char *)(SCREEN_ADR+(18*160)+(18*2)+1); i<40; i++, pp+=2) {
+ *pp = 0x17;
+ }
+}
+
+static void display_init(void)
+{
+ int i;
+ volatile char *pp;
+
+ /* Set HW cursor out of screen boundaries */
+ __outb(0x0F, 0x03D4);
+ __outb(0xFF, 0x03D5);
+
+ __outb(0x0E, 0x03D4);
+ __outb(0xFF, 0x03D5);
+
+
+ serial_echo_init();
+ serial_echo_print("INE_SCROLL;24r"); /* Set scroll area row 7-23 */
+ serial_echo_print(""); /* Clear Screen */
+ serial_echo_print("");
+ serial_echo_print("");
+ serial_echo_print("");
+
+ /* Clear screen & set background to blue */
+ for(i=0, pp=(char *)(SCREEN_ADR); i<80*24; i++) {
+ *pp++ = ' ';
+ *pp++ = 0x17;
+ }
+
+ /* Make the name background green */
+ for(i=0, pp=(char *)(SCREEN_ADR+1); i<TITLE_WIDTH; i++, pp+=2) {
+ *pp = 0x20;
+ }
+ cprint(0, 0, " Memtest86 5.31b ");
+
+ /* Set Blinking "+" */
+ for(i=0, pp=(char *)(SCREEN_ADR+1); i<2; i++, pp+=30) {
+ *pp = 0xA4;
+ }
+ cprint(0, 15, "+");
+
+ /* Do reverse video for the bottom display line */
+ for(i=0, pp=(char *)(SCREEN_ADR+1+(24 * 160)); i<80; i++, pp+=2) {
+ *pp = 0x71;
+ }
+
+ serial_echo_print("");
+}
+
+/*
+ * Initialize test, setup screen and find out how much memory there is.
+ */
+void init(void)
+{
+ int i;
+
+ outb(0x8, 0x3f2); /* Kill Floppy Motor */
+
+ /* Turn on cache */
+ set_cache(1);
+
+ /* Setup the display */
+ display_init();
+
+ cprint(5, 60, "| Time: 0:00:00");
+ cprint(1, COL_MID,"Pass %");
+ cprint(2, COL_MID,"Test %");
+ cprint(3, COL_MID,"Test #");
+ cprint(4, COL_MID,"Testing: ");
+ cprint(5, COL_MID,"Pattern: ");
+ cprint(1, 0, "CLK: (32b Mode)");
+ cprint(2, 0, "L1 Cache: Unknown ");
+ cprint(3, 0, "L2 Cache: Unknown ");
+ cprint(4, 0, "L3 Cache: None ");
+ cprint(5, 0, "Memory : ");
+ cprint(6, 0, "------------------------------------------------------------------------------");
+ cprint(7, 0, "Core#:");
+ cprint(8, 0, "State:");
+ cprint(9, 0, "Cores: Active / Total (Run: All) | Pass: 0 Errors: 0 ");
+ cprint(10, 0, "------------------------------------------------------------------------------");
+
+ /*
+ for(i=0, pp=(char *)(SCREEN_ADR+(5*160)+(53*2)+1); i<20; i++, pp+=2) {
+ *pp = 0x92;
+ }
+
+ for(i=0, pp=(char *)(SCREEN_ADR+0*160+1); i<80; i++, pp+=2) {
+ *pp = 0x47;
+ }
+ */
+
+ cprint(7, 39, "| Chipset : Unknown");
+ cprint(8, 39, "| Memory Type : Unknown");
+
+ for(i=0; i < 6; i++) {
+ cprint(i, COL_MID-2, "| ");
+ }
+
+ footer();
+
+ aprint(5, 10, vv->test_pages);
+
+ vv->pass = 0;
+ vv->msg_line = 0;
+ vv->ecount = 0;
+ vv->ecc_ecount = 0;
+ vv->testsel = -1;
+ vv->msg_line = LINE_SCROLL-1;
+ vv->scroll_start = vv->msg_line * 160;
+ vv->erri.low_addr.page = 0x7fffffff;
+ vv->erri.low_addr.offset = 0xfff;
+ vv->erri.high_addr.page = 0;
+ vv->erri.high_addr.offset = 0;
+ vv->erri.min_bits = 32;
+ vv->erri.max_bits = 0;
+ vv->erri.min_bits = 32;
+ vv->erri.max_bits = 0;
+ vv->erri.maxl = 0;
+ vv->erri.cor_err = 0;
+ vv->erri.ebits = 0;
+ vv->erri.hdr_flag = 0;
+ vv->erri.tbits = 0;
+ for (i=0; tseq[i].msg != NULL; i++) {
+ tseq[i].errors = 0;
+ }
+ if (dmi_initialized) {
+ for (i=0; i < MAX_DMI_MEMDEVS; i++){
+ if (dmi_err_cnts[i] > 0) {
+ dmi_err_cnts[i] = 0;
+ }
+ }
+ }
+
+ /* setup beep mode */
+ beepmode = BEEP_MODE;
+
+ /* Get the cpu and cache information */
+ get_cpuid();
+
+ /* setup pci */
+ pci_init();
+
+ get_cache_size();
+
+ cpu_type();
+
+ cpu_cache_speed();
+
+ /* Check fail safe */
+ failsafe(5000, 0x3B);
+
+ /* Initialize SMP */
+ initialise_cpus();
+
+ for (i = 0; i <num_cpus; i++) {
+ dprint(7, i+7, i%10, 1, 0);
+ cprint(8, i+7, "S");
+ }
+
+ dprint(9, 19, num_cpus, 2, 0);
+
+ if((vv->fail_safe & 3) == 2)
+ {
+ cprint(LINE_CPU,9, "(SMP: Disabled)");
+ cprint(LINE_RAM,9, "Running...");
+ }
+ // dprint(10, 5, found_cpus, 2, 0);
+
+ /* Find Memory Specs */
+ if(vv->fail_safe & 1)
+ {
+ cprint(LINE_CPU, COL_SPEC, " **** FAIL SAFE **** FAIL SAFE **** ");
+ cprint(LINE_RAM, COL_SPEC, " No detection, same reliability ");
+ } else {
+ find_controller();
+ get_spd_spec();
+ if(num_cpus <= 16 && !(vv->fail_safe & 4)) { coretemp(); }
+ }
+
+ if(vv->check_temp > 0 && !(vv->fail_safe & 4))
+ {
+ cprint(LINE_CPU, 26, "| CPU Temp");
+ cprint(LINE_CPU+1, 26, "| øC");
+ }
+
+ beep(600);
+ beep(1000);
+
+ /* Record the start time */
+ asm __volatile__ ("rdtsc":"=a" (vv->startl),"=d" (vv->starth));
+ vv->snapl = vv->startl;
+ vv->snaph = vv->starth;
+ if (l1_cache == 0) { l1_cache = 64; }
+ if (l2_cache == 0) { l2_cache = 512; }
+ vv->printmode=PRINTMODE_ADDRESSES;
+ vv->numpatn=0;
+}
+
+/* Get cache sizes for most AMD and Intel CPUs, exceptions for old CPUs are
+ * handled in CPU detection */
+void get_cache_size()
+{
+ int i, j, n, size;
+ unsigned int v[4];
+ unsigned char *dp = (unsigned char *)v;
+ struct cpuid4_eax *eax = (struct cpuid4_eax *)&v[0];
+ struct cpuid4_ebx *ebx = (struct cpuid4_ebx *)&v[1];
+ struct cpuid4_ecx *ecx = (struct cpuid4_ecx *)&v[2];
+
+ switch(cpu_id.vend_id.char_array[0]) {
+ /* AMD Processors */
+ case 'A':
+ //l1_cache = cpu_id.cache_info.amd.l1_i_sz;
+ l1_cache = cpu_id.cache_info.amd.l1_d_sz;
+ l2_cache = cpu_id.cache_info.amd.l2_sz;
+ l3_cache = cpu_id.cache_info.amd.l3_sz;
+ l3_cache *= 512;
+ break;
+ case 'G':
+ /* Intel Processors */
+ l1_cache = 0;
+ l2_cache = 0;
+ l3_cache = 0;
+
+ /* Use CPUID(4) if it is available */
+ if (cpu_id.max_cpuid > 3) {
+
+ /* figure out how many cache leaves */
+ n = -1;
+ do
+ {
+ ++n;
+ /* Do cpuid(4) loop to find out num_cache_leaves */
+ cpuid_count(4, n, &v[0], &v[1], &v[2], &v[3]);
+ } while ((eax->ctype) != 0);
+
+ /* loop through all of the leaves */
+ for (i=0; i<n; i++)
+ {
+ cpuid_count(4, i, &v[0], &v[1], &v[2], &v[3]);
+
+ /* Check for a valid cache type */
+ if (eax->ctype == 1 || eax->ctype == 3)
+ {
+
+ /* Compute the cache size */
+ size = (ecx->number_of_sets + 1) *
+ (ebx->coherency_line_size + 1) *
+ (ebx->physical_line_partition + 1) *
+ (ebx->ways_of_associativity + 1);
+ size /= 1024;
+
+ switch (eax->level)
+ {
+ case 1:
+ l1_cache += size;
+ break;
+ case 2:
+ l2_cache += size;
+ break;
+ case 3:
+ l3_cache += size;
+ break;
+ }
+ }
+ }
+ return;
+ }
+
+ /* No CPUID(4) so we use the older CPUID(2) method */
+ /* Get number of times to iterate */
+ cpuid(2, &v[0], &v[1], &v[2], &v[3]);
+ n = v[0] & 0xff;
+ for (i=0 ; i<n ; i++) {
+ cpuid(2, &v[0], &v[1], &v[2], &v[3]);
+
+ /* If bit 31 is set, this is an unknown format */
+ for (j=0 ; j<3 ; j++) {
+ if (v[j] & (1 << 31)) {
+ v[j] = 0;
+ }
+ }
+
+ /* Byte 0 is level count, not a descriptor */
+ for (j = 1 ; j < 16 ; j++) {
+ switch(dp[j]) {
+ case 0x6:
+ case 0xa:
+ case 0x66:
+ l1_cache += 8;
+ break;
+ case 0x8:
+ case 0xc:
+ case 0xd:
+ case 0x60:
+ case 0x67:
+ l1_cache += 16;
+ break;
+ case 0xe:
+ l1_cache += 24;
+ break;
+ case 0x9:
+ case 0x2c:
+ case 0x30:
+ case 0x68:
+ l1_cache += 32;
+ break;
+ case 0x39:
+ case 0x3b:
+ case 0x41:
+ case 0x79:
+ l2_cache += 128;
+ break;
+ case 0x3a:
+ l2_cache += 192;
+ break;
+ case 0x21:
+ case 0x3c:
+ case 0x3f:
+ case 0x42:
+ case 0x7a:
+ case 0x82:
+ l2_cache += 256;
+ break;
+ case 0x3d:
+ l2_cache += 384;
+ break;
+ case 0x3e:
+ case 0x43:
+ case 0x7b:
+ case 0x7f:
+ case 0x80:
+ case 0x83:
+ case 0x86:
+ l2_cache += 512;
+ break;
+ case 0x44:
+ case 0x78:
+ case 0x7c:
+ case 0x84:
+ case 0x87:
+ l2_cache += 1024;
+ break;
+ case 0x45:
+ case 0x7d:
+ case 0x85:
+ l2_cache += 2048;
+ break;
+ case 0x48:
+ l2_cache += 3072;
+ break;
+ case 0x4e:
+ l2_cache += 6144;
+ break;
+ case 0x23:
+ case 0xd0:
+ l3_cache += 512;
+ break;
+ case 0xd1:
+ case 0xd6:
+ l3_cache += 1024;
+ break;
+ case 0x25:
+ case 0xd2:
+ case 0xd7:
+ case 0xdc:
+ case 0xe2:
+ l3_cache += 2048;
+ break;
+ case 0x29:
+ case 0x46:
+ case 0x49:
+ case 0xd8:
+ case 0xdd:
+ case 0xe3:
+ l3_cache += 4096;
+ break;
+ case 0x4a:
+ l3_cache += 6144;
+ break;
+ case 0x47:
+ case 0x4b:
+ case 0xde:
+ case 0xe4:
+ l3_cache += 8192;
+ break;
+ case 0x4c:
+ case 0xea:
+ l3_cache += 12288;
+ break;
+ case 0x4d:
+ l3_cache += 16384;
+ break;
+ case 0xeb:
+ l3_cache += 18432;
+ break;
+ case 0xec:
+ l3_cache += 24576;
+ break;
+ } /* end switch */
+ } /* end for 1-16 */
+ } /* end for 0 - n */
+ }
+}
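
[Editorial worked example for the CPUID(4) branch above, using assumed leaf values for a typical 512 KB, 8-way set-associative L2 cache.]

    /* ways_of_associativity = 7, physical_line_partition = 0,
     * coherency_line_size = 63, number_of_sets = 1023:
     *   (1023 + 1) * (63 + 1) * (0 + 1) * (7 + 1) = 524288 bytes
     *   524288 / 1024                             = 512, added to l2_cache in KB */
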
+
+/*
+ * Find IMC type and set global variables accordingly
+ */
+void detect_imc(void)
+{
+ // Check AMD IMC
+ if(cpu_id.vend_id.char_array[0] == 'A' && cpu_id.vers.bits.family == 0xF)
+ {
+ switch(cpu_id.vers.bits.extendedFamily)
+ {
+ case 0x0:
+ imc_type = 0x0100; // Old K8
+ break;
+ case 0x1:
+ case 0x2:
+ imc_type = 0x0101; // K10 (Family 10h & 11h)
+ break;
+ case 0x3:
+ imc_type = 0x0102; // A-Series APU (Family 12h)
+ break;
+ case 0x5:
+ imc_type = 0x0103; // C- / E- / Z- Series APU (Family 14h)
+ break;
+ case 0x6:
+ imc_type = 0x0104; // FX Series (Family 15h)
+ break;
+ case 0x7:
+ imc_type = 0x0105; // Kabini & related (Family 16h)
+ break;
+ }
+ return;
+ }
+
+ // Check Intel IMC
+ if(cpu_id.vend_id.char_array[0] == 'G' && cpu_id.vers.bits.family == 6 && cpu_id.vers.bits.extendedModel)
+ {
+ switch(cpu_id.vers.bits.model)
+ {
+ case 0x5:
+ if(cpu_id.vers.bits.extendedModel == 2) { imc_type = 0x0003; } // Core i3/i5 1st Gen 45 nm (NHM)
+ if(cpu_id.vers.bits.extendedModel == 3) { vv->fail_safe |= 4; } // Atom Clover Trail
+ if(cpu_id.vers.bits.extendedModel == 4) { imc_type = 0x0007; } // HSW-ULT
+ break;
+ case 0x6:
+ if(cpu_id.vers.bits.extendedModel == 3) {
+ imc_type = 0x0009; // Atom Cedar Trail
+ vv->fail_safe |= 4; // Disable Core temp
+ }
+ break;
+ case 0xA:
+ switch(cpu_id.vers.bits.extendedModel)
+ {
+ case 0x1:
+ imc_type = 0x0001; // Core i7 1st Gen 45 nm (NHME)
+ break;
+ case 0x2:
+ imc_type = 0x0004; // Core 2nd Gen (SNB)
+ break;
+ case 0x3:
+ imc_type = 0x0006; // Core 3rd Gen (IVB)
+ break;
+ }
+ break;
+ case 0xC:
+ switch(cpu_id.vers.bits.extendedModel)
+ {
+ case 0x1:
+ if(cpu_id.vers.bits.stepping > 9) { imc_type = 0x0008; } // Atom PineView
+ vv->fail_safe |= 4; // Disable Core temp
+ break;
+ case 0x2:
+ imc_type = 0x0002; // Core i7 1st Gen 32 nm (WMR)
+ break;
+ case 0x3:
+ imc_type = 0x0007; // Core 4th Gen (HSW)
+ break;
+ }
+ break;
+ case 0xD:
+ imc_type = 0x0005; // SNB-E
+ break;
+ case 0xE:
+ imc_type = 0x0001; // Core i7 1st Gen 45 nm (NHM)
+ break;
+ }
+
+ if(imc_type) { tsc_invariable = 1; }
+ return;
+ }
+}
+
+void smp_default_mode(void)
+{
+ int i, result;
+ char *cpupsn = cpu_id.brand_id.char_array;
+ char *disabledcpu[] = { "Opteron", "Xeon", "EPYC", "Genuine Intel" };
+
+ for(i = 0; i < 4; i++)
+ {
+ result = mt86_strstr(cpupsn , disabledcpu[i]);
+ if(result != -1) { vv->fail_safe |= 0b10; }
+ }
+
+ // For the 5.01 release, SMP is disabled by default via a config.h toggle
+ if(CONSERVATIVE_SMP) { vv->fail_safe |= 0b10; }
+
+}
+
+/*
+ * Find CPU type
+ */
+void cpu_type(void)
+{
+ /* If we can get a brand string use it, and we are done */
+ if (cpu_id.max_xcpuid >= 0x80000004) {
+ cprint(0, COL_MID, cpu_id.brand_id.char_array);
+ //If we have a brand string, maybe we have an IMC. Check that.
+ detect_imc();
+ smp_default_mode();
+ return;
+ }
+
+ /* The brand string is not available so we need to figure out
+ * what CPU we have */
+ switch(cpu_id.vend_id.char_array[0]) {
+ /* AMD Processors */
+ case 'A':
+ switch(cpu_id.vers.bits.family) {
+ case 4:
+ switch(cpu_id.vers.bits.model) {
+ case 3:
+ cprint(0, COL_MID, "AMD 486DX2");
+ break;
+ case 7:
+ cprint(0, COL_MID, "AMD 486DX2-WB");
+ break;
+ case 8:
+ cprint(0, COL_MID, "AMD 486DX4");
+ break;
+ case 9:
+ cprint(0, COL_MID, "AMD 486DX4-WB");
+ break;
+ case 14:
+ cprint(0, COL_MID, "AMD 5x86-WT");
+ break;
+ case 15:
+ cprint(0, COL_MID, "AMD 5x86-WB");
+ break;
+ }
+ /* Since we can't get CPU speed or cache info return */
+ return;
+ case 5:
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ cprint(0, COL_MID, "AMD K5");
+ l1_cache = 8;
+ break;
+ case 6:
+ case 7:
+ cprint(0, COL_MID, "AMD K6");
+ break;
+ case 8:
+ cprint(0, COL_MID, "AMD K6-2");
+ break;
+ case 9:
+ cprint(0, COL_MID, "AMD K6-III");
+ break;
+ case 13:
+ cprint(0, COL_MID, "AMD K6-III+");
+ break;
+ }
+ break;
+ case 6:
+
+ switch(cpu_id.vers.bits.model) {
+ case 1:
+ cprint(0, COL_MID, "AMD Athlon (0.25)");
+ break;
+ case 2:
+ case 4:
+ cprint(0, COL_MID, "AMD Athlon (0.18)");
+ break;
+ case 6:
+ if (l2_cache == 64) {
+ cprint(0, COL_MID, "AMD Duron (0.18)");
+ } else {
+ cprint(0, COL_MID, "Athlon XP (0.18)");
+ }
+ break;
+ case 8:
+ case 10:
+ if (l2_cache == 64) {
+ cprint(0, COL_MID, "AMD Duron (0.13)");
+ } else {
+ cprint(0, COL_MID, "Athlon XP (0.13)");
+ }
+ break;
+ case 3:
+ case 7:
+ cprint(0, COL_MID, "AMD Duron");
+ /* Duron stepping 0 CPUID for L2 is broken */
+ /* (AMD errata T13)*/
+ if (cpu_id.vers.bits.stepping == 0) { /* stepping 0 */
+ /* Hard code the right L2 size */
+ l2_cache = 64;
+ } else {
+ }
+ break;
+ }
+ break;
+
+ /* All AMD family values >= 10 have the Brand ID
+ * feature so we don't need to find the CPU type */
+ }
+ break;
+
+ /* Intel or Transmeta Processors */
+ case 'G':
+ if ( cpu_id.vend_id.char_array[7] == 'T' ) { /* GenuineTMx86 */
+ if (cpu_id.vers.bits.family == 5) {
+ cprint(0, COL_MID, "TM 5x00");
+ } else if (cpu_id.vers.bits.family == 15) {
+ cprint(0, COL_MID, "TM 8x00");
+ }
+ l1_cache = cpu_id.cache_info.ch[3] + cpu_id.cache_info.ch[7];
+ l2_cache = (cpu_id.cache_info.ch[11]*256) + cpu_id.cache_info.ch[10];
+ } else { /* GenuineIntel */
+ if (cpu_id.vers.bits.family == 4) {
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ case 1:
+ cprint(0, COL_MID, "Intel 486DX");
+ break;
+ case 2:
+ cprint(0, COL_MID, "Intel 486SX");
+ break;
+ case 3:
+ cprint(0, COL_MID, "Intel 486DX2");
+ break;
+ case 4:
+ cprint(0, COL_MID, "Intel 486SL");
+ break;
+ case 5:
+ cprint(0, COL_MID, "Intel 486SX2");
+ break;
+ case 7:
+ cprint(0, COL_MID, "Intel 486DX2-WB");
+ break;
+ case 8:
+ cprint(0, COL_MID, "Intel 486DX4");
+ break;
+ case 9:
+ cprint(0, COL_MID, "Intel 486DX4-WB");
+ break;
+ }
+ /* Since we can't get CPU speed or cache info return */
+ return;
+ }
+
+
+ switch(cpu_id.vers.bits.family) {
+ case 5:
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 7:
+ cprint(0, COL_MID, "Pentium");
+ if (l1_cache == 0) {
+ l1_cache = 8;
+ }
+ break;
+ case 4:
+ case 8:
+ cprint(0, COL_MID, "Pentium-MMX");
+ if (l1_cache == 0) {
+ l1_cache = 16;
+ }
+ break;
+ }
+ break;
+ case 6:
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ case 1:
+ cprint(0, COL_MID, "Pentium Pro");
+ break;
+ case 3:
+ case 4:
+ cprint(0, COL_MID, "Pentium II");
+ break;
+ case 5:
+ if (l2_cache == 0) {
+ cprint(0, COL_MID, "Celeron");
+ } else {
+ cprint(0, COL_MID, "Pentium II");
+ }
+ break;
+ case 6:
+ if (l2_cache == 128) {
+ cprint(0, COL_MID, "Celeron");
+ } else {
+ cprint(0, COL_MID, "Pentium II");
+ }
+ }
+ break;
+ case 7:
+ case 8:
+ case 11:
+ if (l2_cache == 128) {
+ cprint(0, COL_MID, "Celeron");
+ } else {
+ cprint(0, COL_MID, "Pentium III");
+ }
+ break;
+ case 9:
+ if (l2_cache == 512) {
+ cprint(0, COL_MID, "Celeron M (0.13)");
+ } else {
+ cprint(0, COL_MID, "Pentium M (0.13)");
+ }
+ break;
+ case 10:
+ cprint(0, COL_MID, "Pentium III Xeon");
+ break;
+ case 12:
+ l1_cache = 24;
+ cprint(0, COL_MID, "Atom (0.045)");
+ break;
+ case 13:
+ if (l2_cache == 1024) {
+ cprint(0, COL_MID, "Celeron M (0.09)");
+ } else {
+ cprint(0, COL_MID, "Pentium M (0.09)");
+ }
+ break;
+ case 14:
+ cprint(0, COL_MID, "Intel Core");
+ break;
+ case 15:
+ if (l2_cache == 1024) {
+ cprint(0, COL_MID, "Pentium E");
+ } else {
+ cprint(0, COL_MID, "Intel Core 2");
+ }
+ break;
+ }
+ break;
+ case 15:
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ case 1:
+ case 2:
+ if (l2_cache == 128) {
+ cprint(0, COL_MID, "Celeron");
+ } else {
+ cprint(0, COL_MID, "Pentium 4");
+ }
+ break;
+ case 3:
+ case 4:
+ if (l2_cache == 256) {
+ cprint(0, COL_MID, "Celeron (0.09)");
+ } else {
+ cprint(0, COL_MID, "Pentium 4 (0.09)");
+ }
+ break;
+ case 6:
+ cprint(0, COL_MID, "Pentium D (65nm)");
+ break;
+ default:
+ cprint(0, COL_MID, "Unknown Intel");
+ break;
+ }
+
+ }
+ break;
+
+ /* VIA/Cyrix/Centaur Processors with CPUID */
+ case 'C':
+ if ( cpu_id.vend_id.char_array[1] == 'e' ) { /* CentaurHauls */
+ l1_cache = cpu_id.cache_info.ch[3] + cpu_id.cache_info.ch[7];
+ l2_cache = cpu_id.cache_info.ch[11];
+ switch(cpu_id.vers.bits.family){
+ case 5:
+ cprint(0, COL_MID, "Centaur 5x86");
+ break;
+ case 6: // VIA C3
+ switch(cpu_id.vers.bits.model){
+ default:
+ if (cpu_id.vers.bits.stepping < 8) {
+ cprint(0, COL_MID, "VIA C3 Samuel2");
+ } else {
+ cprint(0, COL_MID, "VIA C3 Eden");
+ }
+ break;
+ case 10:
+ cprint(0, COL_MID, "VIA C7 (C5J)");
+ l1_cache = 64;
+ l2_cache = 128;
+ break;
+ case 13:
+ cprint(0, COL_MID, "VIA C7 (C5R)");
+ l1_cache = 64;
+ l2_cache = 128;
+ break;
+ case 15:
+ cprint(0, COL_MID, "VIA Isaiah (CN)");
+ l1_cache = 64;
+ l2_cache = 128;
+ break;
+ }
+ }
+ } else { /* CyrixInstead */
+ switch(cpu_id.vers.bits.family) {
+ case 5:
+ switch(cpu_id.vers.bits.model) {
+ case 0:
+ cprint(0, COL_MID, "Cyrix 6x86MX/MII");
+ break;
+ case 4:
+ cprint(0, COL_MID, "Cyrix GXm");
+ break;
+ }
+ return;
+
+ case 6: // VIA C3
+ switch(cpu_id.vers.bits.model) {
+ case 6:
+ cprint(0, COL_MID, "Cyrix III");
+ break;
+ case 7:
+ if (cpu_id.vers.bits.stepping < 8) {
+ cprint(0, COL_MID, "VIA C3 Samuel2");
+ } else {
+ cprint(0, COL_MID, "VIA C3 Ezra-T");
+ }
+ break;
+ case 8:
+ cprint(0, COL_MID, "VIA C3 Ezra-T");
+ break;
+ case 9:
+ cprint(0, COL_MID, "VIA C3 Nehemiah");
+ break;
+ }
+ // L1 = L2 = 64 KB from Cyrix III to Nehemiah
+ l1_cache = 64;
+ l2_cache = 64;
+ break;
+ }
+ }
+ break;
+ /* Unknown processor */
+ default:
+ /* Make a guess at the family */
+ switch(cpu_id.vers.bits.family) {
+ case 5:
+ cprint(0, COL_MID, "586");
+ case 6:
+ cprint(0, COL_MID, "686");
+ default:
+ cprint(0, COL_MID, "Unidentified Processor");
+ }
+ }
+}
+
+#define STEST_ADDR 0x100000 /* Measure memory speed starting at 1MB */
+
+/* Measure and display CPU and cache sizes and speeds */
+void cpu_cache_speed()
+{
+ int i, off = 4;
+ ulong speed;
+
+
+ /* Print CPU speed */
+ if ((speed = cpuspeed()) > 0) {
+ if (speed < 999499) {
+ speed += 50; /* for rounding */
+ cprint(1, off, " . MHz");
+ dprint(1, off+1, speed/1000, 3, 1);
+ dprint(1, off+5, (speed/100)%10, 1, 0);
+ } else {
+ speed += 500; /* for rounding */
+ cprint(1, off, " MHz");
+ dprint(1, off, speed/1000, 5, 0);
+ }
+ extclock = speed;
+ }
+
+ /* Print out L1 cache info */
+ /* To measure L1 cache speed we use a block size that is 1/4th */
+ /* of the total L1 cache size since half of it is for instructions */
+ if (l1_cache) {
+ cprint(2, 0, "L1 Cache: K ");
+ dprint(2, 11, l1_cache, 3, 0);
+ if ((speed=memspeed(STEST_ADDR, (l1_cache/2)*1024, 200))) {
+ cprint(2, 16, " MB/s");
+ dprint(2, 16, speed, 6, 0);
+ }
+ }
+
+ /* Print out L2 cache info */
+ /* We measure the L2 cache speed by using a block size that is */
+ /* the size of the L1 cache. We have to fudge if the L1 */
+ /* cache is bigger than the L2 */
+ if (l2_cache) {
+ cprint(3, 0, "L2 Cache: K ");
+ dprint(3, 10, l2_cache, 4, 0);
+
+ if (l2_cache < l1_cache) {
+ i = l1_cache / 4 + l2_cache / 4;
+ } else {
+ i = l1_cache;
+ }
+ if ((speed=memspeed(STEST_ADDR, i*1024, 200))) {
+ cprint(3, 16, " MB/s");
+ dprint(3, 16, speed, 6, 0);
+ }
+ }
+ /* Print out L3 cache info */
+ /* We measure the L3 cache speed by using a block size that is */
+ /* 2X the size of the L2 cache. */
+
+ if (l3_cache)
+ {
+ cprint(4, 0, "L3 Cache: K ");
+ aprint(4, 10, l3_cache/4);
+ //dprint(4, 10, l3_cache, 4, 0);
+
+ i = l2_cache*2;
+
+ if ((speed=memspeed(STEST_ADDR, i*1024, 150))) {
+ cprint(4, 16, " MB/s");
+ dprint(4, 16, speed, 6, 0);
+ }
+ }
+}
+
+/* Measure and display memory speed, multitasked using all CPUs */
+ulong spd[MAX_CPUS];
+void get_mem_speed(int me, int ncpus)
+{
+ int i;
+ ulong speed=0;
+
+ /* Determine memory speed. To find the memory speed we use
+ * A block size that is the sum of all the L1, L2 & L3 caches
+ * in all cpus * 6 */
+ i = (l3_cache + l2_cache + l1_cache) * 4;
+
+ /* Make sure that we have enough memory to do the test */
+ /* If not use all we have */
+ if ((1 + (i * 2)) > (vv->plim_upper << 2)) {
+ i = ((vv->plim_upper <<2) - 1) / 2;
+ }
+
+ speed = memspeed(STEST_ADDR, i * 1024, 100);
+ cprint(5, 16, " MB/s");
+ dprint(5, 16, speed, 6, 0);
+
+}
+
+/* #define TICKS 5 * 11832 (count = 6376)*/
+/* #define TICKS (65536 - 12752) */
+#define TICKS 59659 /* 50 ms */
+
+/* Returns CPU clock in khz */
+ulong stlow, sthigh;
+static int cpuspeed(void)
+{
+ int loops;
+ ulong end_low, end_high;
+
+ if (cpu_id.fid.bits.rdtsc == 0 ) {
+ return(-1);
+ }
+
+ /* Setup timer */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+ outb(0xb0, 0x43);
+ outb(TICKS & 0xff, 0x42);
+ outb(TICKS >> 8, 0x42);
+
+ asm __volatile__ ("rdtsc":"=a" (stlow),"=d" (sthigh));
+
+ loops = 0;
+ do {
+ loops++;
+ } while ((inb(0x61) & 0x20) == 0);
+
+ asm __volatile__ (
+ "rdtsc\n\t" \
+ "subl stlow,%%eax\n\t" \
+ "sbbl sthigh,%%edx\n\t" \
+ :"=a" (end_low), "=d" (end_high)
+ );
+
+ /* Make sure we have a credible result */
+ if (loops < 4 || end_low < 50000) {
+ return(-1);
+ }
+ vv->clks_msec = end_low/50;
+
+ if (tsc_invariable) end_low = correct_tsc(end_low);
+
+ return(vv->clks_msec);
+}
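
[Editorial check of the timing above; the PIT frequency is an architectural constant, the CPU speed is an assumed example. The 8254 timer runs at 1,193,182 Hz, so TICKS = 59659 ≈ 1,193,182 × 0.050 programs a 50 ms one-shot on channel 2 (ports 0x42/0x43). On a hypothetical 2 GHz CPU the rdtsc delta across that gate is about 100,000,000, and end_low / 50 = 2,000,000 clocks per millisecond, i.e. 2,000,000 kHz, which matches the "Returns CPU clock in khz" comment.]
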
+
+/* Measure cache speed by copying a block of memory. */
+/* Returned value is kbytes/second */
+ulong memspeed(ulong src, ulong len, int iter)
+{
+ //int i;
+ //ulong dst, wlen;
+ //ulong st_low, st_high;
+ ulong end_low, end_high;
+ //ulong cal_low, cal_high;
+
+ if (cpu_id.fid.bits.rdtsc == 0 ) {
+ return(-1);
+ }
+ if (len == 0) return(-2);
+
+ //dst = src + len;
+ //wlen = len / 4; /* Length is bytes */
+
+ /* Calibrate the overhead with a zero word copy */
+/* asm __volatile__ ("rdtsc":"=a" (st_low),"=d" (st_high));
+ for (i=0; i<iter; i++) {
+ asm __volatile__ (
+ "movl %0,%%esi\n\t" \
+ "movl %1,%%edi\n\t" \
+ "movl %2,%%ecx\n\t" \
+ "cld\n\t" \
+ "rep\n\t" \
+ "movsl\n\t" \
+ :: "g" (src), "g" (dst), "g" (0)
+ : "esi", "edi", "ecx"
+ );
+ }
+ asm __volatile__ ("rdtsc":"=a" (cal_low),"=d" (cal_high));
+*/
+ /* Compute the overhead time *//*
+ asm __volatile__ (
+ "subl %2,%0\n\t"
+ "sbbl %3,%1"
+ :"=a" (cal_low), "=d" (cal_high)
+ :"g" (st_low), "g" (st_high),
+ "0" (cal_low), "1" (cal_high)
+ );*/
+
+
+ /* Now measure the speed */
+ /* Do the first copy to prime the cache */
+/* asm __volatile__ (
+ "movl %0,%%esi\n\t" \
+ "movl %1,%%edi\n\t" \
+ "movl %2,%%ecx\n\t" \
+ "cld\n\t" \
+ "rep\n\t" \
+ "movsl\n\t" \
+ :: "g" (src), "g" (dst), "g" (wlen)
+ : "esi", "edi", "ecx"
+ );
+ asm __volatile__ ("rdtsc":"=a" (st_low),"=d" (st_high));
+ for (i=0; i<iter; i++) {
+ asm __volatile__ (
+ "movl %0,%%esi\n\t" \
+ "movl %1,%%edi\n\t" \
+ "movl %2,%%ecx\n\t" \
+ "cld\n\t" \
+ "rep\n\t" \
+ "movsl\n\t" \
+ :: "g" (src), "g" (dst), "g" (wlen)
+ : "esi", "edi", "ecx"
+ );
+ }
+ asm __volatile__ ("rdtsc":"=a" (end_low),"=d" (end_high));*/
+
+ /* Compute the elapsed time */
+/* asm __volatile__ (
+ "subl %2,%0\n\t"
+ "sbbl %3,%1"
+ :"=a" (end_low), "=d" (end_high)
+ :"g" (st_low), "g" (st_high),
+ "0" (end_low), "1" (end_high)
+ );*/
+ /* Subtract the overhead time */
+/* asm __volatile__ (
+ "subl %2,%0\n\t"
+ "sbbl %3,%1"
+ :"=a" (end_low), "=d" (end_high)
+ :"g" (cal_low), "g" (cal_high),
+ "0" (end_low), "1" (end_high)
+ );
+*/
+ /* Make sure that the result fits in 32 bits */
+ //hprint(11,40,end_high);
+ if (end_high) {
+ return(-3);
+ }
+ end_low /= 2;
+
+ /* Convert to clocks/KB */
+ end_low /= len;
+ end_low *= 1024;
+ end_low /= iter;
+ if (end_low == 0) {
+ return(-4);
+ }
+
+ /* Convert to kbytes/sec */
+
+ if (tsc_invariable) end_low = correct_tsc(end_low);
+
+ return((vv->clks_msec)/end_low);
+}
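
[Editorial note on the conversion at the end of memspeed(), assuming vv->clks_msec holds CPU clocks per millisecond as computed in cpuspeed(): end_low is the clock count for iter copies of len bytes (the timed copy itself is still commented out in this EFI port), so end_low / len * 1024 / iter is clocks per KB, and clks_msec divided by that gives KB per millisecond, roughly the MB/s figure printed by cpu_cache_speed().]
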
+
+#define rdmsr(msr,val1,val2) \
+ __asm__ __volatile__("rdmsr" \
+ : "=a" (val1), "=d" (val2) \
+ : "c" (msr))
+
+
+ulong correct_tsc(ulong el_org)
+{
+ float coef_now, coef_max;
+ int msr_lo, msr_hi, is_xe;
+
+ rdmsr(0x198, msr_lo, msr_hi);
+ is_xe = (msr_lo >> 31) & 0x1;
+
+ if(is_xe){
+ rdmsr(0x198, msr_lo, msr_hi);
+ coef_max = ((msr_hi >> 8) & 0x1F);
+ if ((msr_hi >> 14) & 0x1) { coef_max = coef_max + 0.5f; }
+ } else {
+ rdmsr(0x17, msr_lo, msr_hi);
+ coef_max = ((msr_lo >> 8) & 0x1F);
+ if ((msr_lo >> 14) & 0x1) { coef_max = coef_max + 0.5f; }
+ }
+
+ if(cpu_id.fid.bits.eist) {
+ rdmsr(0x198, msr_lo, msr_hi);
+ coef_now = ((msr_lo >> 8) & 0x1F);
+ if ((msr_lo >> 14) & 0x1) { coef_now = coef_now + 0.5f; }
+ } else {
+ rdmsr(0x2A, msr_lo, msr_hi);
+ coef_now = (msr_lo >> 22) & 0x1F;
+ }
+ if(coef_max && coef_now) {
+ el_org = (ulong)(el_org * coef_now / coef_max);
+ }
+ return el_org;
+}
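
[Editorial worked example for correct_tsc(), with assumed multiplier values: if the maximum multiplier read from the MSRs is coef_max = 20 and the current EIST multiplier is coef_now = 12, a measured count of 100,000 clocks is rescaled to 100,000 * 12 / 20 = 60,000, compensating for the fixed-rate TSC on tsc_invariable parts.]
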
+
diff --git a/efi_memtest/logger.c b/efi_memtest/memtest86+/efi/logger.c
index e69de29..e69de29 100644
--- a/efi_memtest/logger.c
+++ b/efi_memtest/memtest86+/efi/logger.c
diff --git a/efi_memtest/memtest86+/efi/logger.h b/efi_memtest/memtest86+/efi/logger.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/logger.h
diff --git a/efi_memtest/memtest86+/efi/main.h b/efi_memtest/memtest86+/efi/main.h
index e69de29..6885f35 100644
--- a/efi_memtest/memtest86+/efi/main.h
+++ b/efi_memtest/memtest86+/efi/main.h
@@ -0,0 +1 @@
+void test_start(void);
\ No newline at end of file
diff --git a/efi_memtest/memtest86+/efi/main_asm.h b/efi_memtest/memtest86+/efi/main_asm.h
new file mode 100644
index 0000000..8e6efbc
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/main_asm.h
@@ -0,0 +1,49 @@
+static inline void enable_fp_processing(void) {
+ if (cpu_id.fid.bits.fpu)
+ __asm__ __volatile__
+ (
+ "movl %%cr0, %%eax\n\t"
+ "andl $0x7, %%eax\n\t"
+ "movl %%eax, %%cr0\n\t"
+ : :
+ : "ax"
+ );
+ if (cpu_id.fid.bits.sse)
+ __asm__ __volatile__
+ (
+ "movl %%cr4, %%eax\n\t"
+ "orl $0x00000200, %%eax\n\t"
+ "movl %%eax, %%cr4\n\t"
+ : :
+ : "ax"
+ );
+
+}
+
+static inline void setup_mm_modes(void) {
+ /* If we have PAE, turn it on */
+ if (cpu_id.fid.bits.pae == 1) {
+ __asm__ __volatile__
+ (
+ "movl %%cr4, %%eax\n\t"
+ "orl $0x00000020, %%eax\n\t"
+ "movl %%eax, %%cr4\n\t"
+ : :
+ : "ax"
+ );
+ cprint(LINE_TITLE+1, COL_MODE, "(PAE Mode)");
+ }
+ /* If this is a 64-bit CPU, enable long mode */
+ if (cpu_id.fid.bits.lm == 1) {
+ __asm__ __volatile__
+ (
+ "movl $0xc0000080, %%ecx\n\t"
+ "rdmsr\n\t"
+ "orl $0x00000100, %%eax\n\t"
+ "wrmsr\n\t"
+ : :
+ : "ax", "cx"
+ );
+ cprint(LINE_TITLE+1, COL_MODE, "(X64 Mode)");
+ }
+}
\ No newline at end of file
diff --git a/efi_memtest/memtest86+/efi/test.c b/efi_memtest/memtest86+/efi/test.c
new file mode 100644
index 0000000..c4e0873
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/test.c
@@ -0,0 +1,1551 @@
+/* test.c - MemTest-86 Version 3.4
+ *
+ * Released under version 2 of the Gnu Public License.
+ * By Chris Brady
+ * ----------------------------------------------------
+ * MemTest86+ V5 Specific code (GPL V2.0)
+ * By Samuel DEMEULEMEESTER, sdemeule@memtest.org
+ * http://www.canardpc.com - http://www.memtest.org
+ * Thanks to Passmark for calculate_chunk() and various comments !
+ */
+
+#include "test.h"
+#include "config.h"
+#include "stdint.h"
+#include "cpuid.h"
+#include "smp.h"
+#include "io.h"
+
+extern struct cpu_ident cpu_id;
+extern volatile int mstr_cpu;
+extern volatile int run_cpus;
+extern volatile int test;
+extern volatile int segs, bail;
+extern int test_ticks, nticks;
+extern struct tseq tseq[];
+extern void update_err_counts(void);
+extern void print_err_counts(void);
+void rand_seed( unsigned int seed1, unsigned int seed2, int me);
+ulong rand(int me);
+void poll_errors();
+
+// NOTE(jcoiner):
+// Defining 'STATIC' to empty string results in crashes. (It should
+// work fine, of course.) I suspect relocation problems in reloc.c.
+// When we declare these routines static, we use relative addresses
+// for them instead of looking up their addresses in (supposedly
+// relocated) global elf tables, which avoids the crashes.
+
+#define STATIC static
+//#define STATIC
+
+#define PREFER_C 0
+
+static const void* const nullptr = 0x0;
+
+// Writes *start and *end with the VA range to test.
+//
+// me - this thread's CPU number
+// j - index into v->map for current segment we are testing
+// align - number of bytes to align each block to
+STATIC void calculate_chunk(ulong** start, ulong** end, int me,
+ int j, int makeMultipleOf) {
+ ulong chunk;
+
+ // If we are only running 1 CPU then test the whole block
+ if (run_cpus == 1) {
+ *start = vv->map[j].start;
+ *end = vv->map[j].end;
+ } else {
+
+ // Divide the current segment by the number of CPUs
+ chunk = (ulong)vv->map[j].end-(ulong)vv->map[j].start;
+ chunk /= run_cpus;
+
+ // Round down to the nearest desired bitlength multiple
+ chunk = (chunk + (makeMultipleOf-1)) & ~(makeMultipleOf-1);
+
+ // Figure out chunk boundaries
+ *start = (ulong*)((ulong)vv->map[j].start+(chunk*me));
+ /* Set end addrs for the highest CPU num to the
+ * end of the segment for rounding errors */
+ /* Also rounds down to boundary if needed, may miss some ram but
+ better than crashing or producing false errors. */
+ /* This rounding probably will never happen as the segments should
+ be in 4096 bytes pages if I understand correctly. */
+ if (me == mstr_cpu) {
+ *end = (ulong*)(vv->map[j].end);
+ } else {
+ *end = (ulong*)((ulong)(*start) + chunk);
+ (*end)--;
+ }
+ }
+}
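
[Editorial worked example with assumed numbers: a 1 GiB segment split across run_cpus = 4 with makeMultipleOf = 64 gives chunk = 0x40000000 / 4 = 0x10000000, already a 64-byte multiple, so CPU 0 is assigned [start, start + 0x0FFFFFFC] inclusive, CPU 1 the next 256 MiB, and so on, while the chunk owned by mstr_cpu ends exactly at map[j].end so rounding never leaves the tail of the segment untested.]
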
+
+/* Call segment_fn() for each up-to-SPINSZ segment between
+ * 'start' and 'end'.
+ */
+void foreach_segment
+(ulong* start, ulong* end,
+ int me, const void* ctx, segment_fn func) {
+
+ ASSERT(start < end);
+
+ // Confirm 'start' points to an even dword, and 'end'
+ // should point to an odd dword
+ ASSERT(0 == (((ulong)start) & 0x7));
+ ASSERT(0x4 == (((ulong)end) & 0x7));
+
+ // 'end' may be exactly 0xfffffffc, right at the 4GB boundary.
+ //
+ // To avoid overflow in our loop tests and length calculations,
+ // use dword indices (the '_dw' vars) to avoid overflows.
+ ulong start_dw = ((ulong)start) >> 2;
+ ulong end_dw = ((ulong) end) >> 2;
+
+ // end is always xxxxxffc, but increment end_dw to an
+ // address beyond the segment for easier boundary calculations.
+ ++end_dw;
+
+ ulong seg_dw = start_dw;
+ ulong seg_end_dw = start_dw;
+
+ int done = 0;
+ do {
+ do_tick(me);
+ { BAILR }
+
+ // ensure no overflow
+ ASSERT((seg_end_dw + SPINSZ_DWORDS) > seg_end_dw);
+ seg_end_dw += SPINSZ_DWORDS;
+
+ if (seg_end_dw >= end_dw) {
+ seg_end_dw = end_dw;
+ done++;
+ }
+ if (seg_dw == seg_end_dw) {
+ break;
+ }
+
+ ASSERT(((ulong)seg_end_dw) <= 0x40000000);
+ ASSERT(seg_end_dw > seg_dw);
+ ulong seg_len_dw = seg_end_dw - seg_dw;
+
+ func((ulong*)(seg_dw << 2), seg_len_dw, ctx);
+
+ seg_dw = seg_end_dw;
+ } while (!done);
+}
+
+/* Calls segment_fn() for each segment in vv->map.
+ *
+ * Does not slice by CPU number, so it covers the entire memory.
+ * Contrast to sliced_foreach_segment().
+ */
+STATIC void unsliced_foreach_segment
+(const void* ctx, int me, segment_fn func) {
+ int j;
+ for (j=0; j<segs; j++) {
+ foreach_segment(vv->map[j].start,
+ vv->map[j].end,
+ me, ctx, func);
+ }
+}
+
+/* Calls segment_fn() for each segment to be tested by CPU 'me'.
+ *
+ * In multicore mode, slices the segments by 'me' (the CPU ordinal
+ * number) so that each call will cover only 1/Nth of memory.
+ */
+STATIC void sliced_foreach_segment
+(const void *ctx, int me, segment_fn func) {
+ int j;
+ ulong *start, *end; // VAs
+ ulong* prev_end = 0;
+ for (j=0; j<segs; j++) {
+ calculate_chunk(&start, &end, me, j, 64);
+
+ // Ensure no overlap among chunks
+ ASSERT(end > start);
+ if (prev_end > 0) {
+ ASSERT(prev_end < start);
+ }
+ prev_end = end;
+
+ foreach_segment(start, end, me, ctx, func);
+ }
+}
+
+STATIC void addr_tst1_seg(ulong* restrict buf,
+ ulong len_dw, const void* unused) {
+ // Within each segment:
+ // - choose a low dword offset 'off'
+ // - write pat to *off
+ // - write ~pat to addresses that are above off by
+ // 1, 2, 4, ... dwords up to the top of the segment. None
+ // should alias to the original dword.
+ // - write ~pat to addresses that are below off by
+ // 1, 2, 4, etc dwords, down to the start of the segment. None
+ // should alias to the original dword. If adding a given offset
+ // doesn't produce a single bit address flip (because it produced
+ // a carry) subtracting the same offset should give a single bit flip.
+ // - repeat this, moving off ahead in increments of 1MB;
+ // this covers address bits within physical memory banks, we hope?
+
+ ulong pat;
+ int k;
+
+ for (pat=0x5555aaaa, k=0; k<2; k++) {
+ hprint(LINE_PAT, COL_PAT, pat);
+
+ for (ulong off_dw = 0; off_dw < len_dw; off_dw += (1 << 18)) {
+ buf[off_dw] = pat;
+ pat = ~pat;
+
+ for (ulong more_off_dw = 1; off_dw + more_off_dw < len_dw;
+ more_off_dw = more_off_dw << 1) {
+ ASSERT(more_off_dw); // it should never get to zero
+ buf[off_dw + more_off_dw] = pat;
+ ulong bad;
+ if ((bad = buf[off_dw]) != ~pat) {
+ ad_err1(buf + off_dw,
+ buf + off_dw + more_off_dw,
+ bad, ~pat);
+ break;
+ }
+ }
+ for (ulong more_off_dw = 1; off_dw > more_off_dw;
+ more_off_dw = more_off_dw << 1) {
+ ASSERT(more_off_dw); // it should never get to zero
+ buf[off_dw - more_off_dw] = pat;
+ ulong bad;
+ if ((bad = buf[off_dw]) != ~pat) {
+ ad_err1(buf + off_dw,
+ buf + off_dw - more_off_dw,
+ bad, ~pat);
+ break;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Memory address test, walking ones
+ */
+void addr_tst1(int me)
+{
+ unsliced_foreach_segment(nullptr, me, addr_tst1_seg);
+}
+
+STATIC void addr_tst2_init_segment(ulong* p,
+ ulong len_dw, const void* unused) {
+ ulong* pe = p + (len_dw - 1);
+
+ /* Original C code replaced with hand tuned assembly code
+ * for (; p <= pe; p++) {
+ * *p = (ulong)p;
+ * }
+ */
+ asm __volatile__ (
+ "jmp L91\n\t"
+ ".p2align 4,,7\n\t"
+ "L90:\n\t"
+ "addl $4,%%edi\n\t"
+ "L91:\n\t"
+ "movl %%edi,(%%edi)\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L90\n\t"
+ : : "D" (p), "d" (pe)
+ );
+}
+
+STATIC void addr_tst2_check_segment(ulong* p,
+ ulong len_dw, const void* unused) {
+ ulong* pe = p + (len_dw - 1);
+
+ /* Original C code replaced with hand tuned assembly code
+ * for (; p <= pe; p++) {
+ * if((bad = *p) != (ulong)p) {
+ * ad_err2((ulong)p, bad);
+ * }
+ * }
+ */
+ asm __volatile__
+ (
+ "jmp L95\n\t"
+ ".p2align 4,,7\n\t"
+ "L99:\n\t"
+ "addl $4,%%edi\n\t"
+ "L95:\n\t"
+ "movl (%%edi),%%ecx\n\t"
+ "cmpl %%edi,%%ecx\n\t"
+ "jne L97\n\t"
+ "L96:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L99\n\t"
+ "jmp L98\n\t"
+
+ "L97:\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rdi\n\t"
+ "call ad_err2\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rdx\n\t"
+ "jmp L96\n\t"
+
+ "L98:\n\t"
+ : : "D" (p), "d" (pe)
+ : "ecx"
+ );
+}
+
+/*
+ * Memory address test, own address
+ */
+void addr_tst2(int me)
+{
+ cprint(LINE_PAT, COL_PAT, "address ");
+
+ /* Write each address with its own address */
+ unsliced_foreach_segment(nullptr, me, addr_tst2_init_segment);
+ { BAILR }
+
+ /* Each address should have its own address */
+ unsliced_foreach_segment(nullptr, me, addr_tst2_check_segment);
+}
+
+typedef struct {
+ int me;
+ ulong xorVal;
+} movinvr_ctx;
+
+STATIC void movinvr_init(ulong* p,
+ ulong len_dw, const void* vctx) {
+ ulong* pe = p + (len_dw - 1);
+ const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;
+ /* Original C code replaced with hand tuned assembly code */
+ /*
+ for (; p <= pe; p++) {
+ *p = rand(me);
+ }
+ */
+
+ asm __volatile__
+ (
+ "jmp L200\n\t"
+ ".p2align 4,,7\n\t"
+ "L201:\n\t"
+ "addl $4,%%edi\n\t"
+ "L200:\n\t"
+ "pushq %%rcx\n\t"
+ "call rand\n\t"
+ "popq %%rcx\n\t"
+ "movl %%eax,(%%edi)\n\t"
+ "cmpl %%ebx,%%edi\n\t"
+ "jb L201\n\t"
+ : : "D" (p), "b" (pe), "c" (ctx->me)
+ : "eax"
+ );
+}
+
+STATIC void movinvr_body(ulong* p, ulong len_dw, const void* vctx) {
+ ulong* pe = p + (len_dw - 1);
+ const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;
+
+ /* Original C code replaced with hand tuned assembly code */
+
+ /*for (; p <= pe; p++) {
+ num = rand(me);
+ if (i) {
+ num = ~num;
+ }
+ if ((bad=*p) != num) {
+ mt86_error((ulong*)p, num, bad);
+ }
+ *p = ~num;
+ }*/
+
+ asm __volatile__
+ (
+ "pushq %%rbp\n\t"
+
+ // Skip first increment
+ "jmp L26\n\t"
+ ".p2align 4,,7\n\t"
+
+ // increment 4 bytes (32-bits)
+ "L27:\n\t"
+ "addl $4,%%edi\n\t"
+
+ // Check this byte
+ "L26:\n\t"
+
+ // Get next random number, pass in me(edx), random value returned in num(eax)
+ // num = rand(me);
+ // cdecl call maintains all registers except eax, ecx, and edx
+ // We maintain edx with a push and pop here using it also as an input
+ // we don't need the current eax value and want it to change to the return value
+ // we overwrite ecx shortly after this discarding its current value
+ "pushq %%rdx\n\t" // Push function inputs onto stack
+ "call rand\n\t"
+ "popq %%rdx\n\t" // Remove function inputs from stack
+
+ // XOR the random number with xorVal(ebx), which is either 0xffffffff or 0 depending on the outer loop
+ // if (i) { num = ~num; }
+ "xorl %%ebx,%%eax\n\t"
+
+ // Move the current value of the current position p(edi) into bad(ecx)
+ // (bad=*p)
+ "movl (%%edi),%%ecx\n\t"
+
+ // Compare bad(ecx) to num(eax)
+ "cmpl %%eax,%%ecx\n\t"
+
+ // If not equal jump the error case
+ "jne L23\n\t"
+
+ // Set a new value or not num(eax) at the current position p(edi)
+ // *p = ~num;
+ "L25:\n\t"
+ "movl $0xffffffff,%%ebp\n\t"
+ "xorl %%ebp,%%eax\n\t"
+ "movl %%eax,(%%edi)\n\t"
+
+ // Loop until current position p(edi) equals the end position pe(esi)
+ "cmpl %%esi,%%edi\n\t"
+ "jb L27\n\t"
+ "jmp L24\n"
+
+ // Error case
+ "L23:\n\t"
+ // Must manually maintain eax, ecx, and edx as part of cdecl call convention
+ "pushq %%rdx\n\t"
+ "pushq %%rcx\n\t" // Next three pushes are functions input
+ "pushq %%rax\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t" // Remove function inputs from stack and restore register values
+ "popq %%rax\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rdx\n\t"
+ "jmp L25\n"
+
+ "L24:\n\t"
+ "popq %%rbp\n\t"
+ :: "D" (p), "S" (pe), "b" (ctx->xorVal),
+ "d" (ctx->me)
+ : "eax", "ecx"
+ );
+}
+
+/*
+ * Test all of memory using a "half moving inversions" algorithm using random
+ * numbers and their complement as the data pattern. Since we are not able to
+ * produce random numbers in reverse order testing is only done in the forward
+ * direction.
+ */
+void movinvr(int me)
+{
+ int i, seed1, seed2;
+
+ movinvr_ctx ctx;
+ ctx.me = me;
+ ctx.xorVal = 0;
+
+ /* Initialize memory with initial sequence of random numbers. */
+ if (cpu_id.fid.bits.rdtsc) {
+ asm __volatile__ ("rdtsc":"=a" (seed1),"=d" (seed2));
+ } else {
+ seed1 = 521288629 + vv->pass;
+ seed2 = 362436069 - vv->pass;
+ }
+
+ /* Display the current seed */
+ if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, seed1);
+ rand_seed(seed1, seed2, me);
+
+ sliced_foreach_segment(&ctx, me, movinvr_init);
+ { BAILR }
+
+ /* Do moving inversions test. Check for initial pattern and then
+ * write the complement for each memory location.
+ */
+ for (i=0; i<2; i++) {
+ rand_seed(seed1, seed2, me);
+
+ if (i) {
+ ctx.xorVal = 0xffffffff;
+ } else {
+ ctx.xorVal = 0;
+ }
+
+ sliced_foreach_segment(&ctx, me, movinvr_body);
+ { BAILR }
+ }
+}
+
+typedef struct {
+ ulong p1;
+ ulong p2;
+} movinv1_ctx;
+
+STATIC void movinv1_init(ulong* start,
+ ulong len_dw, const void* vctx) {
+ const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
+
+ ulong p1 = ctx->p1;
+ ulong* p = start;
+
+ asm __volatile__
+ (
+ "rep\n\t"
+ "stosl\n\t"
+ : : "c" (len_dw), "D" (p), "a" (p1)
+ );
+}
+
+STATIC void movinv1_bottom_up(ulong* start,
+ ulong len_dw, const void* vctx) {
+ const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
+ ulong p1 = ctx->p1;
+ ulong p2 = ctx->p2;
+ ulong* p = start;
+ ulong* pe = p + (len_dw - 1);
+
+ // Original C code replaced with hand tuned assembly code
+ // seems broken
+ /*for (; p <= pe; p++) {
+ if ((bad=*p) != p1) {
+ mt86_error((ulong*)p, p1, bad);
+ }
+ *p = p2;
+ }*/
+
+ asm __volatile__
+ (
+ "jmp L2\n\t"
+ ".p2align 4,,7\n\t"
+ "L0:\n\t"
+ "addl $4,%%edi\n\t"
+ "L2:\n\t"
+ "movl (%%edi),%%ecx\n\t"
+ "cmpl %%eax,%%ecx\n\t"
+ "jne L3\n\t"
+ "L5:\n\t"
+ "movl %%ebx,(%%edi)\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L0\n\t"
+ "jmp L4\n"
+
+ "L3:\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rbx\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rax\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rax\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rbx\n\t"
+ "popq %%rdx\n\t"
+ "jmp L5\n"
+
+ "L4:\n\t"
+ :: "a" (p1), "D" (p), "d" (pe), "b" (p2)
+ : "ecx"
+ );
+}
+
+STATIC void movinv1_top_down(ulong* start,
+ ulong len_dw, const void* vctx) {
+ const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
+ ulong p1 = ctx->p1;
+ ulong p2 = ctx->p2;
+ ulong* p = start + (len_dw - 1);
+ ulong* pe = start;
+
+ //Original C code replaced with hand tuned assembly code
+ // seems broken
+ /*do {
+ if ((bad=*p) != p2) {
+ mt86_error((ulong*)p, p2, bad);
+ }
+ *p = p1;
+ } while (--p >= pe);*/
+
+ asm __volatile__
+ (
+ "jmp L9\n\t"
+ ".p2align 4,,7\n\t"
+ "L11:\n\t"
+ "subl $4, %%edi\n\t"
+ "L9:\n\t"
+ "movl (%%edi),%%ecx\n\t"
+ "cmpl %%ebx,%%ecx\n\t"
+ "jne L6\n\t"
+ "L10:\n\t"
+ "movl %%eax,(%%edi)\n\t"
+ "cmpl %%edi, %%edx\n\t"
+ "jne L11\n\t"
+ "jmp L7\n\t"
+
+ "L6:\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rax\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rbx\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rbx\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rax\n\t"
+ "popq %%rdx\n\t"
+ "jmp L10\n"
+
+ "L7:\n\t"
+ :: "a" (p1), "D" (p), "d" (pe), "b" (p2)
+ : "ecx"
+ );
+}
+
+/*
+ * Test all of memory using a "moving inversions" algorithm using the
+ * pattern in p1 and its complement in p2.
+ */
+void movinv1 (int iter, ulong p1, ulong p2, int me)
+{
+ int i;
+
+ /* Display the current pattern */
+ if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);
+
+ movinv1_ctx ctx;
+ ctx.p1 = p1;
+ ctx.p2 = p2;
+ sliced_foreach_segment(&ctx, me, movinv1_init);
+ { BAILR }
+
+ /* Do moving inversions test. Check for initial pattern and then
+ * write the complement for each memory location. Test from bottom
+ * up and then from the top down. */
+ for (i=0; i<iter; i++) {
+ sliced_foreach_segment(&ctx, me, movinv1_bottom_up);
+ { BAILR }
+
+ // NOTE(jcoiner):
+ // For the top-down pass, the original 5.01 code iterated over
+ // 'segs' in from n-1 down to 0, and then within each mapped segment,
+ // it would form the SPINSZ windows from the top down -- thus forming
+ // a different set of windows than the bottom-up pass, when the segment
+ // is not an integer number of windows.
+ //
+ // My guess is that this buys us very little additional coverage, that
+ // the value in going top-down happens at the word or cache-line level
+ // and that there's little to be gained from reversing the direction of
+ // the outer loops. So I'm leaving a 'direction' bit off of the
+ // foreach_segment() routines for now.
+ sliced_foreach_segment(&ctx, me, movinv1_top_down);
+ { BAILR }
+ }
+}
+
+typedef struct {
+ ulong p1;
+ ulong lb;
+ ulong hb;
+ int sval;
+ int off;
+} movinv32_ctx;
+
+STATIC void movinv32_init(ulong* restrict buf,
+ ulong len_dw, const void* vctx) {
+ const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
+
+ ulong* p = buf;
+ ulong* pe = buf + (len_dw - 1);
+
+ int k = ctx->off;
+ ulong pat = ctx->p1;
+ ulong lb = ctx->lb;
+ int sval = ctx->sval;
+
+ /* Original C code replaced with hand tuned assembly code
+ * while (p <= pe) {
+ * *p = pat;
+ * if (++k >= 32) {
+ * pat = lb;
+ * k = 0;
+ * } else {
+ * pat = pat << 1;
+ * pat |= sval;
+ * }
+ * p++;
+ * }
+ */
+ asm __volatile__
+ (
+ "jmp L20\n\t"
+ ".p2align 4,,7\n\t"
+ "L923:\n\t"
+ "addl $4,%%edi\n\t"
+ "L20:\n\t"
+ "movl %%ecx,(%%edi)\n\t"
+ "addl $1,%%ebx\n\t"
+ "cmpl $32,%%ebx\n\t"
+ "jne L21\n\t"
+ "movl %%esi,%%ecx\n\t"
+ "xorl %%ebx,%%ebx\n\t"
+ "jmp L22\n"
+ "L21:\n\t"
+ "shll $1,%%ecx\n\t"
+ "orl %%eax,%%ecx\n\t"
+ "L22:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L923\n\t"
+ :: "D" (p),"d" (pe),"b" (k),"c" (pat),
+ "a" (sval), "S" (lb)
+ );
+}
+
+STATIC void movinv32_bottom_up(ulong* restrict buf, ulong len_dw,
+ const void* vctx) {
+ const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
+
+ ulong* p = buf;
+ ulong* pe = buf + (len_dw - 1);
+
+ int k = ctx->off;
+ ulong pat = ctx->p1;
+ ulong lb = ctx->lb;
+ int sval = ctx->sval;
+
+ /* Original C code replaced with hand tuned assembly code
+ * while (1) {
+ * if ((bad=*p) != pat) {
+ * mt86_error((ulong*)p, pat, bad);
+ * }
+ * *p = ~pat;
+ * if (p >= pe) break;
+ * p++;
+ *
+ * if (++k >= 32) {
+ * pat = lb;
+ * k = 0;
+ * } else {
+ * pat = pat << 1;
+ * pat |= sval;
+ * }
+ * }
+ */
+ asm __volatile__
+ (
+ "pushq %%rbp\n\t"
+ "jmp L30\n\t"
+ ".p2align 4,,7\n\t"
+ "L930:\n\t"
+ "addl $4,%%edi\n\t"
+ "L30:\n\t"
+ "movl (%%edi),%%ebp\n\t"
+ "cmpl %%ecx,%%ebp\n\t"
+ "jne L34\n\t"
+
+ "L35:\n\t"
+ "notl %%ecx\n\t"
+ "movl %%ecx,(%%edi)\n\t"
+ "notl %%ecx\n\t"
+ "incl %%ebx\n\t"
+ "cmpl $32,%%ebx\n\t"
+ "jne L31\n\t"
+ "movl %%esi,%%ecx\n\t"
+ "xorl %%ebx,%%ebx\n\t"
+ "jmp L32\n"
+ "L31:\n\t"
+ "shll $1,%%ecx\n\t"
+ "orl %%eax,%%ecx\n\t"
+ "L32:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L930\n\t"
+ "jmp L33\n\t"
+
+ "L34:\n\t"
+ "pushq %%rsi\n\t"
+ "pushq %%rax\n\t"
+ "pushq %%rbx\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rbp\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rbp\n\t"
+ "popq %%rdx\n\t"
+ "popq %%rbx\n\t"
+ "popq %%rax\n\t"
+ "popq %%rsi\n\t"
+ "jmp L35\n"
+
+ "L33:\n\t"
+ "popq %%rbp\n\t"
+ : "=b" (k),"=c" (pat)
+ : "D" (p),"d" (pe),"b" (k),"c" (pat),
+ "a" (sval), "S" (lb)
+ );
+}
+
+STATIC void movinv32_top_down(ulong* restrict buf,
+ ulong len_dw, const void* vctx) {
+ const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
+
+ ulong* pe = buf;
+ ulong* p = buf + (len_dw - 1);
+
+ int k = ctx->off;
+ ulong pat = ctx->p1;
+ ulong hb = ctx->hb;
+ int sval = ctx->sval;
+ ulong p3 = (ulong)sval << 31;
+
+ // Advance 'k' and 'pat' to where they would have been
+ // at the end of the corresponding bottom_up segment.
+ //
+ // The '-1' is because we didn't advance 'k' or 'pat'
+ // on the final bottom_up loop, so they're off by one...
+ ulong mod_len = (len_dw - 1) % 32;
+ for (int i = 0; i < mod_len; i++) {
+ if (++k >= 32) {
+ pat = ctx->lb;
+ k = 0;
+ } else {
+ pat = pat << 1;
+ pat |= sval;
+ }
+ }
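+
+    // Editor's worked example (hypothetical numbers, not part of the original
+    // change): if len_dw == 71, the bottom-up pass advanced (k, pat) 70 times;
+    // the sequence repeats every 32 steps, so replaying mod_len == 70 % 32 == 6
+    // steps above reproduces the state where the bottom-up pass left off.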
+
+ // Increment 'k' only because the code below has an off-by-one
+ // interpretation of 'k' relative to the bottom_up routine.
+ // There it ranges from 0:31, and here it ranges from 1:32.
+ k++;
+
+ /* Original C code replaced with hand tuned assembly code */
+#if PREFER_C
+ ulong bad;
+ while(1) {
+ if ((bad=*p) != ~pat) {
+ mt86_error((ulong*)p, ~pat, bad);
+ }
+ *p = pat;
+ if (p <= pe) break;
+ p--;
+
+ if (--k <= 0) {
+ k = 32;
+ pat = hb;
+ } else {
+ pat = pat >> 1;
+ pat |= p3;
+ }
+ };
+#else
+ asm __volatile__
+ (
+ "pushq %%rbp\n\t"
+ "jmp L40\n\t"
+ ".p2align 4,,7\n\t"
+ "L49:\n\t"
+ "subl $4,%%edi\n\t"
+ "L40:\n\t"
+ "movl (%%edi),%%ebp\n\t"
+ "notl %%ecx\n\t"
+ "cmpl %%ecx,%%ebp\n\t"
+ "jne L44\n\t"
+
+ "L45:\n\t"
+ "notl %%ecx\n\t"
+ "movl %%ecx,(%%edi)\n\t"
+ "decl %%ebx\n\t"
+ "cmpl $0,%%ebx\n\t"
+ "jg L41\n\t"
+ "movl %%esi,%%ecx\n\t"
+ "movl $32,%%ebx\n\t"
+ "jmp L42\n"
+ "L41:\n\t"
+ "shrl $1,%%ecx\n\t"
+ "orl %%eax,%%ecx\n\t"
+ "L42:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "ja L49\n\t"
+ "jmp L43\n\t"
+
+ "L44:\n\t"
+ "pushq %%rsi\n\t"
+ "pushq %%rax\n\t"
+ "pushq %%rbx\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rbp\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rbp\n\t"
+ "popq %%rdx\n\t"
+ "popq %%rbx\n\t"
+ "popq %%rax\n\t"
+ "popq %%rsi\n\t"
+ "jmp L45\n"
+
+ "L43:\n\t"
+ "popq %%rbp\n\t"
+ : : "D" (p),"d" (pe),"b" (k),"c" (pat),
+ "a" (p3), "S" (hb)
+ );
+#endif
+}
+
+void movinv32(int iter, ulong p1, ulong lb, ulong hb, int sval, int off,int me)
+{
+ // First callsite:
+ // - p1 has 1 bit set (somewhere)
+ // - lb = 1 ("low bit")
+ // - hb = 0x80000000 ("high bit")
+ // - sval = 0
+ // - 'off' indicates the position of the set bit in p1
+ //
+ // Second callsite is the same, but inverted:
+ // - p1 has 1 bit clear (somewhere)
+ // - lb = 0xfffffffe
+ // - hb = 0x7fffffff
+ // - sval = 1
+ // - 'off' indicates the position of the cleared bit in p1
+
+ movinv32_ctx ctx;
+ ctx.p1 = p1;
+ ctx.lb = lb;
+ ctx.hb = hb;
+ ctx.sval = sval;
+ ctx.off = off;
+
+ /* Display the current pattern */
+ if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);
+
+ sliced_foreach_segment(&ctx, me, movinv32_init);
+ { BAILR }
+
+ /* Do moving inversions test. Check for initial pattern and then
+ * write the complement for each memory location. Test from bottom
+ * up and then from the top down. */
+ for (int i=0; i<iter; i++) {
+ sliced_foreach_segment(&ctx, me, movinv32_bottom_up);
+ { BAILR }
+
+ sliced_foreach_segment(&ctx, me, movinv32_top_down);
+ { BAILR }
+ }
+}
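+
+/* Editor's note (illustrative sketch, not part of the original change): the
+ * two call shapes described in the comment above -- a walking-one and a
+ * walking-zero pattern. The wrapper and the loop are hypothetical; only the
+ * argument shapes come from the comment. Compiled out. */
+#if 0
+static void movinv32_example_calls(int iter, int me)
+{
+    for (int bit = 0; bit < 32; bit++) {
+        ulong p1 = 1UL << bit;
+        /* first callsite: one bit set, lb = 1, hb = 0x80000000, sval = 0 */
+        movinv32(iter, p1, 1, 0x80000000, 0, bit, me);
+        /* second callsite: one bit clear, lb = 0xfffffffe, hb = 0x7fffffff, sval = 1 */
+        movinv32(iter, p1 ^ 0xffffffffUL, 0xfffffffe, 0x7fffffff, 1, bit, me);
+    }
+}
+#endif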
+
+typedef struct {
+ int offset;
+ ulong p1;
+ ulong p2;
+} modtst_ctx;
+
+STATIC void modtst_sparse_writes(ulong* restrict start,
+ ulong len_dw, const void* vctx) {
+ const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
+ ulong p1 = ctx->p1;
+ ulong offset = ctx->offset;
+
+#if PREFER_C
+ for (ulong i = offset; i < len_dw; i += MOD_SZ) {
+ start[i] = p1;
+ }
+#else
+ ulong* p = start + offset;
+ ulong* pe = start + len_dw;
+ asm __volatile__
+ (
+ "jmp L60\n\t"
+ ".p2align 4,,7\n\t"
+
+ "L60:\n\t"
+ "movl %%eax,(%%edi)\n\t"
+ "addl $80,%%edi\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L60\n\t"
+ :: "D" (p), "d" (pe), "a" (p1)
+ );
+#endif
+}
+
+STATIC void modtst_dense_writes(ulong* restrict start, ulong len_dw,
+ const void* vctx) {
+ const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
+ ulong p2 = ctx->p2;
+ ulong offset = ctx->offset;
+
+ ASSERT(offset < MOD_SZ);
+
+ ulong k = 0;
+#if PREFER_C
+ for (ulong i = 0; i < len_dw; i++) {
+ if (k != offset) {
+ start[i] = p2;
+ }
+ if (++k >= MOD_SZ) {
+ k = 0;
+ }
+ }
+#else
+ ulong* pe = start + (len_dw - 1);
+ asm __volatile__
+ (
+ "jmp L50\n\t"
+ ".p2align 4,,7\n\t"
+
+ "L54:\n\t"
+ "addl $4,%%edi\n\t"
+ "L50:\n\t"
+ "cmpl %%ebx,%%ecx\n\t"
+ "je L52\n\t"
+ "movl %%eax,(%%edi)\n\t"
+ "L52:\n\t"
+ "incl %%ebx\n\t"
+ "cmpl $19,%%ebx\n\t"
+ "jle L53\n\t"
+ "xorl %%ebx,%%ebx\n\t"
+ "L53:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L54\n\t"
+ : : "D" (start), "d" (pe), "a" (p2),
+ "b" (k), "c" (offset)
+ );
+#endif
+}
+
+STATIC void modtst_check(ulong* restrict start,
+ ulong len_dw, const void* vctx) {
+ const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
+ ulong p1 = ctx->p1;
+ ulong offset = ctx->offset;
+
+ ASSERT(offset < MOD_SZ);
+
+#if PREFER_C
+ ulong bad;
+ for (ulong i = offset; i < len_dw; i += MOD_SZ) {
+ if ((bad = start[i]) != p1)
+ mt86_error(start + i, p1, bad);
+ }
+#else
+ ulong* p = start + offset;
+ ulong* pe = start + len_dw;
+ asm __volatile__
+ (
+ "jmp L70\n\t"
+ ".p2align 4,,7\n\t"
+
+ "L70:\n\t"
+ "movl (%%edi),%%ecx\n\t"
+ "cmpl %%eax,%%ecx\n\t"
+ "jne L71\n\t"
+ "L72:\n\t"
+ "addl $80,%%edi\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L70\n\t"
+ "jmp L73\n\t"
+
+ "L71:\n\t"
+ "pushq %%rdx\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rax\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "popq %%rax\n\t"
+ "popq %%rcx\n\t"
+ "popq %%rdx\n\t"
+ "jmp L72\n"
+
+ "L73:\n\t"
+ : : "D" (p), "d" (pe), "a" (p1)
+ : "ecx"
+ );
+#endif
+}
+
+/*
+ * Test all of memory using modulo X access pattern.
+ */
+void modtst(int offset, int iter, ulong p1, ulong p2, int me)
+{
+ modtst_ctx ctx;
+ ctx.offset = offset;
+ ctx.p1 = p1;
+ ctx.p2 = p2;
+
+ /* Display the current pattern */
+ if (mstr_cpu == me) {
+ hprint(LINE_PAT, COL_PAT-2, p1);
+ cprint(LINE_PAT, COL_PAT+6, "-");
+ dprint(LINE_PAT, COL_PAT+7, offset, 2, 1);
+ }
+
+ /* Write every nth location with pattern */
+ sliced_foreach_segment(&ctx, me, modtst_sparse_writes);
+ { BAILR }
+
+ /* Write the rest of memory "iter" times with the pattern complement */
+ for (ulong i=0; i<iter; i++) {
+ sliced_foreach_segment(&ctx, me, modtst_dense_writes);
+ { BAILR }
+ }
+
+ /* Now check every nth location */
+ sliced_foreach_segment(&ctx, me, modtst_check);
+}
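+
+/* Editor's note (illustrative sketch, not part of the original change): a
+ * hypothetical driver for modtst -- each offset 0..MOD_SZ-1 is exercised with
+ * a pattern and its complement as the sparse/dense values. The wrapper and
+ * the pattern choice are assumptions. Compiled out. */
+#if 0
+static void modtst_example_calls(int iter, ulong pat, int me)
+{
+    for (int off = 0; off < MOD_SZ; off++) {
+        modtst(off, iter, pat, ~pat, me);
+    }
+}
+#endif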
+
+#if PREFER_C
+
+STATIC void movsl(ulong* dest,
+ ulong* src,
+ ulong size_in_dwords) {
+ /* Logically equivalent to:
+
+ for (ulong i = 0; i < size_in_dwords; i++)
+ dest[i] = src[i];
+
+       However: the 'rep movsl' used below performs the entire copy
+       in one instruction -- this is essentially what an optimized
+       'memcpy' does -- so the hardware makes it very fast.
+
+ Even in PREFER_C mode, we want the brute force of movsl!
+ */
+ asm __volatile__
+ (
+ "cld\n"
+ "jmp L1189\n\t"
+
+ ".p2align 4,,7\n\t"
+ "L1189:\n\t"
+
+ "movl %1,%%edi\n\t" // dest
+ "movl %0,%%esi\n\t" // src
+ "movl %2,%%ecx\n\t" // len in dwords
+ "rep\n\t"
+ "movsl\n\t"
+
+ :: "g" (src), "g" (dest), "g" (size_in_dwords)
+ : "edi", "esi", "ecx"
+ );
+}
+#endif // PREFER_C
+
+STATIC ulong block_move_normalize_len_dw(ulong len_dw) {
+ // The block_move test works with sets of 64-byte blocks,
+ // so ensure our total length is a multiple of 64.
+ //
+ // In fact, since we divide the region in half, and each half-region
+ // is a set of 64-byte blocks, the full region should be a multiple of 128
+ // bytes.
+ //
+    // Note that there's no requirement for the start address of the region
+    // to be 64-byte aligned; it can be any dword.
+ ulong result = (len_dw >> 5) << 5;
+ ASSERT(result > 0);
+ return result;
+}
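+
+// Editor's worked example (hypothetical numbers, not part of the original
+// change): len_dw == 1000 dwords (4000 bytes) is trimmed to 992 dwords
+// (3968 bytes == 31 * 128), so each half-region is a whole number of
+// 64-byte blocks.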
+
+STATIC void block_move_init(ulong* restrict buf,
+ ulong len_dw, const void* unused_ctx) {
+ len_dw = block_move_normalize_len_dw(len_dw);
+
+ // Compute 'len' in units of 64-byte chunks:
+ ulong len = len_dw >> 4;
+
+ // We only need to initialize len/2, since we'll just copy
+ // the first half onto the second half in the move step.
+ len = len >> 1;
+
+ ulong base_val = 1;
+#if PREFER_C
+ while(len > 0) {
+ ulong neg_val = ~base_val;
+
+ // Set a block of 64 bytes // first block DWORDS are:
+ buf[0] = base_val; // 0x00000001
+ buf[1] = base_val; // 0x00000001
+ buf[2] = base_val; // 0x00000001
+ buf[3] = base_val; // 0x00000001
+ buf[4] = neg_val; // 0xfffffffe
+ buf[5] = neg_val; // 0xfffffffe
+ buf[6] = base_val; // 0x00000001
+ buf[7] = base_val; // 0x00000001
+ buf[8] = base_val; // 0x00000001
+ buf[9] = base_val; // 0x00000001
+ buf[10] = neg_val; // 0xfffffffe
+ buf[11] = neg_val; // 0xfffffffe
+ buf[12] = base_val; // 0x00000001
+ buf[13] = base_val; // 0x00000001
+ buf[14] = neg_val; // 0xfffffffe
+ buf[15] = neg_val; // 0xfffffffe
+
+ buf += 16; // advance to next 64-byte block
+ len--;
+
+ // Rotate the bit left, including an all-zero state.
+ // It can't hurt to have a periodicity of 33 instead of
+ // a power of two.
+ if (base_val == 0) {
+ base_val = 1;
+ } else if (base_val & 0x80000000) {
+ base_val = 0;
+ } else {
+ base_val = base_val << 1;
+ }
+ }
+#else
+ asm __volatile__
+ (
+ "jmp L100\n\t"
+
+ ".p2align 4,,7\n\t"
+ "L100:\n\t"
+
+ // First loop eax is 0x00000001, edx is 0xfffffffe
+ "movl %%eax, %%edx\n\t"
+ "notl %%edx\n\t"
+
+ // Set a block of 64-bytes // First loop DWORDS are
+ "movl %%eax,0(%%edi)\n\t" // 0x00000001
+ "movl %%eax,4(%%edi)\n\t" // 0x00000001
+ "movl %%eax,8(%%edi)\n\t" // 0x00000001
+ "movl %%eax,12(%%edi)\n\t" // 0x00000001
+ "movl %%edx,16(%%edi)\n\t" // 0xfffffffe
+ "movl %%edx,20(%%edi)\n\t" // 0xfffffffe
+ "movl %%eax,24(%%edi)\n\t" // 0x00000001
+ "movl %%eax,28(%%edi)\n\t" // 0x00000001
+ "movl %%eax,32(%%edi)\n\t" // 0x00000001
+ "movl %%eax,36(%%edi)\n\t" // 0x00000001
+ "movl %%edx,40(%%edi)\n\t" // 0xfffffffe
+ "movl %%edx,44(%%edi)\n\t" // 0xfffffffe
+ "movl %%eax,48(%%edi)\n\t" // 0x00000001
+ "movl %%eax,52(%%edi)\n\t" // 0x00000001
+ "movl %%edx,56(%%edi)\n\t" // 0xfffffffe
+ "movl %%edx,60(%%edi)\n\t" // 0xfffffffe
+
+ // rotate left with carry,
+ // second loop eax is 0x00000002
+ // second loop edx is (~eax) 0xfffffffd
+ "rcll $1, %%eax\n\t"
+
+ // Move current position forward 64-bytes (to start of next block)
+ "leal 64(%%edi), %%edi\n\t"
+
+ // Loop until end
+ "decl %%ecx\n\t"
+ "jnz L100\n\t"
+
+ : : "D" (buf), "c" (len), "a" (base_val)
+ : "edx"
+ );
+#endif
+}
+
+typedef struct {
+ int iter;
+ int me;
+} block_move_ctx;
+
+STATIC void block_move_move(ulong* restrict buf,
+ ulong len_dw, const void* vctx) {
+ const block_move_ctx* restrict ctx = (const block_move_ctx*)vctx;
+ ulong iter = ctx->iter;
+ int me = ctx->me;
+
+ len_dw = block_move_normalize_len_dw(len_dw);
+
+ /* Now move the data around
+ * First move the data up half of the segment size we are testing
+ * Then move the data to the original location + 32 bytes
+ */
+ ulong half_len_dw = len_dw / 2; // Half the size of this block in DWORDS
+ ASSERT(half_len_dw > 8);
+
+    ulong* mid = buf + half_len_dw; // VA at mid-point of this block.
+    (void)mid; // keep the compiler quiet while the asm path below is stubbed out
+ for (int i=0; i<iter; i++) {
+ if (i > 0) {
+ // foreach_segment() called this before the 0th iteration,
+ // so don't tick twice in quick succession.
+ do_tick(me);
+ }
+ { BAILR }
+
+#if PREFER_C
+ // Move first half to 2nd half:
+ movsl(/*dest=*/ mid, /*src=*/ buf, half_len_dw);
+
+ // Move the second half, less the last 8 dwords
+ // to the first half plus an offset of 8 dwords.
+ movsl(/*dest=*/ buf + 8, /*src=*/ mid, half_len_dw - 8);
+
+ // Finally, move the last 8 dwords of the 2nd half
+ // to the first 8 dwords of the first half.
+        movsl(/*dest=*/ buf, /*src=*/ mid + half_len_dw - 8, 8);
+#else
+ /* asm __volatile__ // TODO
+ (
+ "cld\n"
+ "jmp L110\n\t"
+
+ ".p2align 4,,7\n\t"
+ "L110:\n\t"
+
+ //
+ // At the end of all this
+ // - the second half equals the inital value of the first half
+ // - the first half is right shifted 32-bytes (with wrapping)
+ //
+
+ // Move first half to second half
+ "movl %1,%%edi\n\t" // Destination 'mid' (mid point)
+ "movl %0,%%esi\n\t" // Source, 'buf' (start point)
+ "movl %2,%%ecx\n\t" // Length, 'half_len_dw' (size of a half in DWORDS)
+ "rep\n\t"
+ "movsl\n\t"
+
+ // Move the second half, less the last 32-bytes. To the first half, offset plus 32-bytes
+ "movl %0,%%edi\n\t"
+ "addl $32,%%edi\n\t" // Destination 'buf' plus 32 bytes
+ "movl %1,%%esi\n\t" // Source, 'mid'
+ "movl %2,%%ecx\n\t"
+ "subl $8,%%ecx\n\t" // Length, 'half_len_dw'
+ "rep\n\t"
+ "movsl\n\t"
+
+ // Move last 8 DWORDS (32-bytes) of the second half to the start of the first half
+ "movl %0,%%edi\n\t" // Destination 'buf'
+ // Source, 8 DWORDS from the end of the second half, left over by the last rep/movsl
+ "movl $8,%%ecx\n\t" // Length, 8 DWORDS (32-bytes)
+ "rep\n\t"
+ "movsl\n\t"
+
+ :: "g" (buf), "g" (mid), "g" (half_len_dw)
+ : "edi", "esi", "ecx"
+ );*/
+#endif
+ }
+}
+
+STATIC void block_move_check(ulong* restrict buf,
+ ulong len_dw, const void* unused_ctx) {
+ len_dw = block_move_normalize_len_dw(len_dw);
+
+ /* Now check the data.
+ * This is rather crude, we just check that the
+ * adjacent words are the same.
+ */
+#if PREFER_C
+ for (ulong i = 0; i < len_dw; i = i + 2) {
+ if (buf[i] != buf[i+1]) {
+ mt86_error(buf+i, buf[i], buf[i+1]);
+ }
+ }
+#else
+ ulong* pe = buf + (len_dw - 2);
+ asm __volatile__
+ (
+ "jmp L120\n\t"
+
+ ".p2align 4,,7\n\t"
+ "L124:\n\t"
+ "addl $8,%%edi\n\t" // Next QWORD
+ "L120:\n\t"
+
+ // Compare adjacent DWORDS
+ "movl (%%edi),%%ecx\n\t"
+ "cmpl 4(%%edi),%%ecx\n\t"
+ "jnz L121\n\t" // Print error if they don't match
+
+ // Loop until end of block
+ "L122:\n\t"
+ "cmpl %%edx,%%edi\n\t"
+ "jb L124\n"
+ "jmp L123\n\t"
+
+ "L121:\n\t"
+ // eax not used so we don't need to save it as per cdecl
+        // ecx is used but not restored; we don't need its value after this point
+ "pushq %%rdx\n\t"
+ "pushq 4(%%edi)\n\t"
+ "pushq %%rcx\n\t"
+ "pushq %%rdi\n\t"
+ "call mt86_error\n\t"
+ "popq %%rdi\n\t"
+ "addl $8,%%esp\n\t"
+ "popq %%rdx\n\t"
+ "jmp L122\n"
+ "L123:\n\t"
+ :: "D" (buf), "d" (pe)
+ : "ecx"
+ );
+#endif
+}
+
+/*
+ * Test memory using block moves
+ * Adapted from Robert Redelmeier's burnBX test
+ */
+void block_move(int iter, int me)
+{
+ cprint(LINE_PAT, COL_PAT-2, " ");
+
+ block_move_ctx ctx;
+ ctx.iter = iter;
+ ctx.me = me;
+
+ /* Initialize memory with the initial pattern. */
+ sliced_foreach_segment(&ctx, me, block_move_init);
+ { BAILR }
+ s_barrier();
+
+ /* Now move the data around */
+ sliced_foreach_segment(&ctx, me, block_move_move);
+ { BAILR }
+ s_barrier();
+
+ /* And check it. */
+ sliced_foreach_segment(&ctx, me, block_move_check);
+}
+
+typedef struct {
+ ulong pat;
+} bit_fade_ctx;
+
+STATIC void bit_fade_fill_seg(ulong* restrict p,
+ ulong len_dw, const void* vctx) {
+ const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx;
+ ulong pat = ctx->pat;
+
+ for (ulong i = 0; i < len_dw; i++) {
+ p[i] = pat;
+ }
+}
+
+/*
+ * Test memory for bit fade, fill memory with pattern.
+ */
+void bit_fade_fill(ulong p1, int me)
+{
+ /* Display the current pattern */
+ hprint(LINE_PAT, COL_PAT, p1);
+
+ /* Initialize memory with the initial pattern. */
+ bit_fade_ctx ctx;
+ ctx.pat = p1;
+ unsliced_foreach_segment(&ctx, me, bit_fade_fill_seg);
+}
+
+STATIC void bit_fade_chk_seg(ulong* restrict p,
+ ulong len_dw, const void* vctx) {
+ const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx;
+ ulong pat = ctx->pat;
+
+ for (ulong i = 0; i < len_dw; i++) {
+ ulong bad;
+ if ((bad=p[i]) != pat) {
+ mt86_error(p+i, pat, bad);
+ }
+ }
+}
+
+void bit_fade_chk(ulong p1, int me)
+{
+ bit_fade_ctx ctx;
+ ctx.pat = p1;
+
+ /* Make sure that nothing changed while sleeping */
+ unsliced_foreach_segment(&ctx, me, bit_fade_chk_seg);
+}
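+
+/* Editor's note (illustrative sketch, not part of the original change): the
+ * bit-fade sequence that these two helpers implement together with sleep()
+ * below. The real driver steps through it one phase at a time via the
+ * bitf_seq state in the main loop; the all-ones/all-zeros patterns and the
+ * 90-minute hold follow the upstream memtest86+ description and are not
+ * taken from this patch. Compiled out. */
+#if 0
+static void bit_fade_sequence_sketch(int me)
+{
+    bit_fade_fill(0xffffffff, me);   /* fill with all ones */
+    sleep(60 * 90, 1, me, 0);        /* hold for ~90 minutes */
+    bit_fade_chk(0xffffffff, me);    /* verify nothing decayed */
+    bit_fade_fill(0, me);            /* repeat with all zeros */
+    sleep(60 * 90, 1, me, 0);
+    bit_fade_chk(0, me);
+}
+#endif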
+
+/* Sleep for N seconds */
+void sleep(long n, int flag, int me,
+ int sms /* interpret 'n' as milliseconds instead */)
+{
+ ulong sh, sl, l, h, t, ip=0;
+
+ /* save the starting time */
+ asm __volatile__(
+ "rdtsc":"=a" (sl),"=d" (sh));
+
+ /* loop for n seconds */
+ while (1) {
+ /*asm __volatile__(
+ "rep ; nop\n\t"
+ "rdtsc":"=a" (l),"=d" (h));
+ asm __volatile__ (
+ "subl %2,%0\n\t"
+ "sbbl %3,%1"
+ :"=a" (l), "=d" (h)
+ :"g" (sl), "g" (sh),
+ "0" (l), "1" (h));*/
+
+ h = 1; // TODO remove
+ l = 1; // TODO remove
+ if (sms != 0) {
+ t = h * ((unsigned)0xffffffff / vv->clks_msec);
+ t += (l / vv->clks_msec);
+ } else {
+ t = h * ((unsigned)0xffffffff / vv->clks_msec) / 1000;
+ t += (l / vv->clks_msec) / 1000;
+ }
+
+ /* Is the time up? */
+ if (t >= n) {
+ break;
+ }
+
+ /* Only display elapsed time if flag is set */
+ if (flag == 0) {
+ continue;
+ }
+
+ if (t != ip) {
+ do_tick(me);
+ { BAILR }
+ ip = t;
+ }
+ }
+}
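+
+/* Editor's note (not part of the original change): the h/l arithmetic above
+ * approximates dividing the 64-bit TSC delta by the calibrated
+ * clocks-per-millisecond value, i.e. roughly the following. Compiled out. */
+#if 0
+static ulong elapsed_sketch(ulong h, ulong l, int sms)
+{
+    unsigned long long delta = ((unsigned long long)h << 32) | l; /* TSC ticks */
+    unsigned long long ms = delta / vv->clks_msec;
+    return sms ? (ulong)ms : (ulong)(ms / 1000); /* milliseconds or seconds */
+}
+#endif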
+
+void beep(unsigned int frequency)
+{
+#if 0
+ // BOZO(jcoiner)
+ // Removed this, we need to define outb_p() and inb_p()
+ // before reintroducing it.
+#else
+ unsigned int count = 1193180 / frequency;
+
+ // Switch on the speaker
+ outb_p(inb_p(0x61)|3, 0x61);
+
+ // Set command for counter 2, 2 byte write
+ outb_p(0xB6, 0x43);
+
+ // Select desired Hz
+ outb_p(count & 0xff, 0x42);
+ outb((count >> 8) & 0xff, 0x42);
+
+ // Block for 100 microseconds
+ sleep(100, 0, 0, 1);
+
+ // Switch off the speaker
+ outb(inb_p(0x61)&0xFC, 0x61);
+#endif
+}
diff --git a/efi_memtest/memtest86+/efi/test_cache.h b/efi_memtest/memtest86+/efi/test_cache.h
new file mode 100644
index 0000000..a6ea496
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/test_cache.h
@@ -0,0 +1,20 @@
+static inline void cache_off(void)
+{
+ asm(
+ "push %rax\n\t"
+ "movq %cr0,%eax\n\t"
+ "orl $0x40000000,%eax\n\t" /* Set CD */
+ "movq %eax,%cr0\n\t"
+ "wbinvd\n\t"
+ "pop %rax\n\t");
+}
+
+static inline void cache_on(void)
+{
+ asm(
+ "push %rax\n\t"
+ "movq %cr0,%eax\n\t"
+ "andl $0x9fffffff,%eax\n\t" /* Clear CD and NW */
+ "movq %eax,%cr0\n\t"
+ "pop %rax\n\t");
+}
diff --git a/efi_memtest/memtest86+/efi/vmem.c b/efi_memtest/memtest86+/efi/vmem.c
new file mode 100644
index 0000000..80e69d2
--- /dev/null
+++ b/efi_memtest/memtest86+/efi/vmem.c
@@ -0,0 +1,159 @@
+/* vmem.c - MemTest-86
+ *
+ * Virtual memory handling (PAE)
+ *
+ * Released under version 2 of the Gnu Public License.
+ * By Chris Brady
+ */
+#include "stdint.h"
+#include "test.h"
+#include "cpuid.h"
+
+extern struct cpu_ident cpu_id;
+
+static unsigned long mapped_win = 1;
+void paging_off(void)
+{
+ if (!cpu_id.fid.bits.pae)
+ return;
+/* __asm__ __volatile__
+ (
+ // Disable paging
+ "movl %%cr0, %%eax\n\t"
+ "andl $0x7FFFFFFF, %%eax\n\t"
+ "movl %%eax, %%cr0\n\t"
+ : :
+ : "ax"
+ );*/
+}
+
+static void paging_on(void *pdp)
+{
+ if (!cpu_id.fid.bits.pae)
+ return;
+/* __asm__ __volatile__
+ (
+ // Load the page table address
+ "movl %0, %%cr3\n\t"
+ // Enable paging
+ "movl %%cr0, %%eax\n\t"
+ "orl $0x80000000, %%eax\n\t"
+ "movl %%eax, %%cr0\n\t"
+ :
+ : "r" (pdp)
+ : "ax"
+ );*/
+}
+
+static void paging_on_lm(void *pml)
+{
+/* if (!cpu_id.fid.bits.pae)
+ return;
+ __asm__ __volatile__
+ (
+ // Load the page table address
+ "movl %0, %%cr3\n\t"
+ // Enable paging
+ "movl %%cr0, %%eax\n\t"
+ "orl $0x80000000, %%eax\n\t"
+ "movl %%eax, %%cr0\n\t"
+ :
+ : "r" (pml)
+ : "ax"
+ );*/
+}
+
+int map_page(unsigned long page)
+{
+ unsigned long i;
+ struct pde {
+ unsigned long addr_lo;
+ unsigned long addr_hi;
+ };
+ extern unsigned char pdp[];
+ extern unsigned char pml4[];
+ extern struct pde pd2[];
+ unsigned long win = page >> 19;
+
+ /* Less than 2 GB so no mapping is required */
+ if (win == 0) {
+ return 0;
+ }
+ if (cpu_id.fid.bits.pae == 0) {
+ /* Fail, we don't have PAE */
+ return -1;
+ }
+ if (cpu_id.fid.bits.lm == 0 && (page > 0x1000000)) {
+ /* Fail, we want an address that is out of bounds (> 64GB)
+ * for PAE and no long mode (ie. 32 bit CPU).
+ */
+ return -1;
+ }
+ /* Compute the page table entries... */
+ for(i = 0; i < 1024; i++) {
+ /*-----------------10/30/2004 12:37PM---------------
+ * 0xE3 --
+ * Bit 0 = Present bit. 1 = PDE is present
+ * Bit 1 = Read/Write. 1 = memory is writable
+ * Bit 2 = Supervisor/User. 0 = Supervisor only (CPL 0-2)
+ * Bit 3 = Writethrough. 0 = writeback cache policy
+ * Bit 4 = Cache Disable. 0 = page level cache enabled
+ * Bit 5 = Accessed. 1 = memory has been accessed.
+ * Bit 6 = Dirty. 1 = memory has been written to.
+ * Bit 7 = Page Size. 1 = page size is 2 MBytes
+ * --------------------------------------------------*/
+ pd2[i].addr_lo = ((win & 1) << 31) + ((i & 0x3ff) << 21) + 0xE3;
+ pd2[i].addr_hi = (win >> 1);
+ }
+ paging_off();
+ if (cpu_id.fid.bits.lm == 1) {
+ paging_on_lm(pml4);
+ } else {
+ paging_on(pdp);
+ }
+ mapped_win = win;
+ return 0;
+}
+
+void *mapping(unsigned long phys_page)
+{
+ void *result;
+ if (phys_page < WIN_SZ_PAGES) {
+ /* If the page is below 2GB, address it directly */
+ result = (void *)(phys_page << 12);
+ }
+ else {
+ // Higher physical pages map to a virtual address
+ // in the 2G-4G range.
+ unsigned long alias;
+ alias = phys_page & 0x7FFFF;
+ alias += 0x80000;
+ result = (void *)(alias << 12);
+ }
+ return result;
+}
+
+void *emapping(unsigned long phys_page)
+{
+ void *result;
+ result = mapping(phys_page - 1);
+ /* Fill in the low address bits */
+ result = ((unsigned char *)result) + 0xffc;
+ return result;
+}
+
+unsigned long page_of(void *addr)
+{
+ unsigned long page;
+ page = ((unsigned long)addr) >> 12;
+ if (page >= 0x80000) {
+ page &= 0x7FFFF;
+ page += mapped_win << 19;
+ }
+#if 0
+ cprint(LINE_SCROLL -2, 0, "page_of( )-> ");
+ hprint(LINE_SCROLL -2, 8, ((unsigned long)addr));
+ hprint(LINE_SCROLL -2, 20, page);
+#endif
+ return page;
+}
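+
+/* Editor's note (illustrative sketch, not part of the original change): a
+ * hypothetical round trip through the window mapping above, assuming
+ * WIN_SZ_PAGES == 0x80000 (2 GB of 4 KB pages). The page number is made up.
+ * Compiled out. */
+#if 0
+static void vmem_example(void)
+{
+    unsigned long phys_page = 0x123456;      /* a page above the 2 GB boundary */
+    if (map_page(phys_page) == 0) {          /* select the 2 GB window containing it */
+        unsigned long *va = mapping(phys_page);
+        *va = 0xA5A5A5A5;                    /* access it through the 2G-4G alias */
+        /* page_of(va) now recovers 0x123456, the original physical page */
+    }
+}
+#endif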
diff --git a/efi_memtest/memtest86+/error.c b/efi_memtest/memtest86+/error.c
index 61afa1b..48a94ee 100644
--- a/efi_memtest/memtest86+/error.c
+++ b/efi_memtest/memtest86+/error.c
@@ -466,8 +466,8 @@ char spin[4] = {'|','/','-','\\'};
void do_tick(int me)
{
- int i, j, pct;
- ulong h, l, n, t;
+ int i, /*j,*/ pct;
+ ulong h, /*l,*/ n/*, t*/;
extern int mstr_cpu;
if (++spin_idx[me] > 3) {
@@ -588,7 +588,9 @@ void do_tick(int me)
/* We can't do the elapsed time unless the rdtsc instruction
* is supported
*/
- if (cpu_id.fid.bits.rdtsc) {
+
+ // TODO
+ /* if (cpu_id.fid.bits.rdtsc) {
asm __volatile__(
"rdtsc":"=a" (l),"=d" (h));
asm __volatile__ (
@@ -620,7 +622,7 @@ void do_tick(int me)
}
vv->each_sec = j;
}
- }
+ }*/
/* Poll for ECC errors */
/*
diff --git a/efi_memtest/memtest86+/from main b/efi_memtest/memtest86+/from main
new file mode 100644
index 0000000..742e890
--- /dev/null
+++ b/efi_memtest/memtest86+/from main
@@ -0,0 +1,342 @@
+
+
+ /* First time (for this CPU) initialization */
+ if (start_seq < 2) {
+
+ /* These steps are only done by the boot cpu */
+ if (my_cpu_num == 0) {
+ my_cpu_ord = cpu_ord++;
+ smp_set_ordinal(my_cpu_num, my_cpu_ord);
+ parse_command_line();
+ clear_screen();
+ /* Initialize the barrier so the lock in btrace will work.
+ * Will get redone later when we know how many CPUs we have */
+ barrier_init(1);
+ btrace(my_cpu_num, __LINE__, "Begin ", 1, 0, 0);
+ /* Find memory size */
+ mem_size(); /* must be called before initialise_cpus(); */
+ /* Fill in the CPUID table */
+ get_cpuid();
+ /* Startup the other CPUs */
+ start_seq = 1;
+ //initialise_cpus();
+ btrace(my_cpu_num, __LINE__, "BeforeInit", 1, 0, 0);
+ /* Draw the screen and get system information */
+ init();
+
+ /* Set defaults and initialize variables */
+ set_defaults();
+
+ /* Setup base address for testing, 1 MB */
+ win0_start = 0x100;
+
+ /* Set relocation address to 32Mb if there is enough
+ * memory. Otherwise set it to 3Mb */
+ /* Large reloc addr allows for more testing overlap */
+ if ((ulong)vv->pmap[vv->msegs-1].end > 0x2f00) {
+ high_test_adr = 0x2000000;
+ } else {
+ high_test_adr = 0x300000;
+ }
+ win1_end = (high_test_adr >> 12);
+
+ /* Adjust the map to not test the page at 939k,
+ * reserved for locks */
+ vv->pmap[0].end--;
+
+ find_ticks_for_pass();
+ } else {
+ /* APs only, Register the APs */
+ btrace(my_cpu_num, __LINE__, "AP_Start ", 0, my_cpu_num,
+ cpu_ord);
+ smp_ap_booted(my_cpu_num);
+            /* Assign a sequential CPU ordinal to each active cpu */
+ spin_lock(&barr->mutex);
+ my_cpu_ord = cpu_ord++;
+ smp_set_ordinal(my_cpu_num, my_cpu_ord);
+ spin_unlock(&barr->mutex);
+ btrace(my_cpu_num, __LINE__, "AP_Done ", 0, my_cpu_num,
+ my_cpu_ord);
+ }
+
+ } else {
+ /* Unlock after a relocation */
+ spin_unlock(&barr->mutex);
+ /* Get the CPU ordinal since it is lost during relocation */
+ my_cpu_ord = smp_my_ord_num(my_cpu_num);
+ btrace(my_cpu_num, __LINE__, "Reloc_Done",0,my_cpu_num,my_cpu_ord);
+ }
+
+ /* A barrier to insure that all of the CPUs are done with startup */
+ barrier();
+ btrace(my_cpu_num, __LINE__, "1st Barr ", 1, my_cpu_num, my_cpu_ord);
+
+
+ /* Setup Memory Management and measure memory speed, we do it here
+ * because we need all of the available CPUs */
+ if (start_seq < 2) {
+
+ /* Enable floating point processing */
+ enable_fp_processing();
+
+
+ btrace(my_cpu_num, __LINE__, "Mem Mgmnt ",
+ 1, cpu_id.fid.bits.pae, cpu_id.fid.bits.lm);
+ /* Setup memory management modes */
+ setup_mm_modes();
+
+ /* Get the memory Speed with all CPUs */
+ get_mem_speed(my_cpu_num, num_cpus);
+ }
+
+ /* Set the initialized flag only after all of the CPU's have
+ * Reached the barrier. This insures that relocation has
+ * been completed for each CPU. */
+ btrace(my_cpu_num, __LINE__, "Start Done", 1, 0, 0);
+ start_seq = 2;
+
+ /* Loop through all tests */
+ while (1) {
+        /* If the restart flag is set, reset all initial params */
+ if (restart_flag) {
+ set_defaults();
+ continue;
+ }
+ /* Skip single CPU tests if we are using only one CPU */
+ if (tseq[test].cpu_sel == -1 &&
+ (num_cpus == 1 || cpu_mode != CPM_ALL)) {
+ test++;
+ continue;
+ }
+
+ test_setup();
+
+ /* Loop through all possible windows */
+ while (win_next <= ((ulong)vv->pmap[vv->msegs-1].end + WIN_SZ_PAGES)) {
+
+ /* Main scheduling barrier */
+ cprint(8, my_cpu_num+7, "W");
+ btrace(my_cpu_num, __LINE__, "Sched_Barr", 1,window,win_next);
+ barrier();
+
+ /* Don't go over the 8TB PAE limit */
+ if (win_next > MAX_MEM_PAGES) {
+ break;
+ }
+
+ /* For the bit fade test, #11, we cannot relocate so bump the
+ * window to 1 */
+ if (tseq[test].pat == 11 && window == 0) {
+ window = 1;
+ }
+
+ /* Relocate if required */
+ if (window != 0 && (ulong)&_start != LOW_TEST_ADR) {
+ btrace(my_cpu_num, __LINE__, "Sched_RelL", 1,0,0);
+ run_at(LOW_TEST_ADR, my_cpu_num);
+ }
+ if (window == 0 && vv->plim_lower >= win0_start) {
+ window++;
+ }
+ if (window == 0 && (ulong)&_start == LOW_TEST_ADR) {
+ btrace(my_cpu_num, __LINE__, "Sched_RelH", 1,0,0);
+ run_at(high_test_adr, my_cpu_num);
+ }
+
+ /* Decide which CPU(s) to use */
+ btrace(my_cpu_num, __LINE__, "Sched_CPU0",1,cpu_sel,
+ tseq[test].cpu_sel);
+ run = 1;
+ switch(cpu_mode) {
+ case CPM_RROBIN:
+ case CPM_SEQ:
+ /* Select a single CPU */
+ if (my_cpu_ord == cpu_sel) {
+ mstr_cpu = cpu_sel;
+ run_cpus = 1;
+ } else {
+ run = 0;
+ }
+ break;
+ case CPM_ALL:
+ /* Use all CPUs */
+ if (tseq[test].cpu_sel == -1) {
+ /* Round robin through all of the CPUs */
+ if (my_cpu_ord == cpu_sel) {
+ mstr_cpu = cpu_sel;
+ run_cpus = 1;
+ } else {
+ run = 0;
+ }
+ } else {
+ /* Use the number of CPUs specified by the test,
+ * Starting with zero */
+ if (my_cpu_ord >= tseq[test].cpu_sel) {
+ run = 0;
+ }
+ /* Set the master CPU to the highest CPU number
+ * that has been selected */
+ if (act_cpus < tseq[test].cpu_sel) {
+ mstr_cpu = act_cpus-1;
+ run_cpus = act_cpus;
+ } else {
+ mstr_cpu = tseq[test].cpu_sel-1;
+ run_cpus = tseq[test].cpu_sel;
+ }
+ }
+ }
+ btrace(my_cpu_num, __LINE__, "Sched_CPU1",1,run_cpus,run);
+ barrier();
+ dprint(9, 7, run_cpus, 2, 0);
+
+ /* Setup a sub barrier for only the selected CPUs */
+ if (my_cpu_ord == mstr_cpu) {
+ s_barrier_init(run_cpus);
+ }
+
+            /* Make sure the sub barrier is ready before proceeding */
+ barrier();
+
+ /* Not selected CPUs go back to the scheduling barrier */
+ if (run == 0 ) {
+ continue;
+ }
+ cprint(8, my_cpu_num+7, "-");
+ btrace(my_cpu_num, __LINE__, "Sched_Win0",1,window,win_next);
+
+ if (my_cpu_ord == mstr_cpu) {
+ switch (window) {
+ /* Special case for relocation */
+ case 0:
+ winx.start = 0;
+ winx.end = win1_end;
+ window++;
+ break;
+ /* Special case for first segment */
+ case 1:
+ winx.start = win0_start;
+ winx.end = WIN_SZ_PAGES;
+ win_next += WIN_SZ_PAGES;
+ window++;
+ break;
+ /* For all other windows */
+ default:
+ winx.start = win_next;
+ win_next += WIN_SZ_PAGES;
+ winx.end = win_next;
+ }
+ btrace(my_cpu_num,__LINE__,"Sched_Win1",1,winx.start,
+ winx.end);
+
+ /* Find the memory areas to test */
+ segs = compute_segments(winx, my_cpu_num);
+ }
+ s_barrier();
+ btrace(my_cpu_num,__LINE__,"Sched_Win2",1,segs,
+ vv->map[0].pbase_addr);
+
+ if (segs == 0) {
+ /* No memory in this window so skip it */
+ continue;
+ }
+
+ /* map in the window... */
+ if (map_page(vv->map[0].pbase_addr) < 0) {
+ /* Either there is no PAE or we are at the PAE limit */
+ break;
+ }
+
+ btrace(my_cpu_num, __LINE__, "Strt_Test ",1,my_cpu_num,
+ my_cpu_ord);
+ do_test(my_cpu_ord);
+ btrace(my_cpu_num, __LINE__, "End_Test ",1,my_cpu_num,
+ my_cpu_ord);
+
+ paging_off();
+
+ } /* End of window loop */
+
+ s_barrier();
+ btrace(my_cpu_num, __LINE__, "End_Win ",1,test, window);
+
+ /* Setup for the next set of windows */
+ win_next = 0;
+ window = 0;
+ bail = 0;
+
+ /* Only the master CPU does the end of test housekeeping */
+ if (my_cpu_ord != mstr_cpu) {
+ continue;
+ }
+
+ /* Special handling for the bit fade test #11 */
+ if (tseq[test].pat == 11 && bitf_seq != 6) {
+ /* Keep going until the sequence is complete. */
+ bitf_seq++;
+ continue;
+ } else {
+ bitf_seq = 0;
+ }
+
+ /* Select advancement of CPUs and next test */
+ switch(cpu_mode) {
+ case CPM_RROBIN:
+ if (++cpu_sel >= act_cpus) {
+ cpu_sel = 0;
+ }
+ next_test();
+ break;
+ case CPM_SEQ:
+ if (++cpu_sel >= act_cpus) {
+ cpu_sel = 0;
+ next_test();
+ }
+ break;
+ case CPM_ALL:
+ if (tseq[test].cpu_sel == -1)
+ {
+ /* Do the same test for each CPU */
+ if (++cpu_sel >= act_cpus)
+ {
+ cpu_sel = 0;
+ next_test();
+ } else {
+ continue;
+ }
+ } else {
+ next_test();
+ }
+ } //????
+ btrace(my_cpu_num, __LINE__, "Next_CPU ",1,cpu_sel,test);
+
+ /* If this was the last test then we finished a pass */
+ if (pass_flag)
+ {
+ pass_flag = 0;
+
+ vv->pass++;
+
+ dprint(LINE_INFO, 49, vv->pass, 5, 0);
+ find_ticks_for_pass();
+ ltest = -1;
+
+ if (vv->ecount == 0)
+ {
+ /* If onepass is enabled and we did not get any errors
+ * reboot to exit the test */
+ if (onepass) { reboot(); }
+ if (!btflag)
+ cprint(LINE_MSG, COL_MSG-8,
+ "** Pass complete, no errors, press Esc to exit **");
+ if(BEEP_END_NO_ERROR)
+ {
+ beep(1000);
+ beep(2000);
+ beep(1000);
+ beep(2000);
+ }
+ }
+ }
+
+ bail=0;
+ } /* End test loop */ \ No newline at end of file
diff --git a/efi_memtest/memtest86+/main.c b/efi_memtest/memtest86+/main.c
index d8eac4a..0ed8d8a 100644
--- a/efi_memtest/memtest86+/main.c
+++ b/efi_memtest/memtest86+/main.c
@@ -42,6 +42,9 @@ extern struct barrier_s *barr;
extern int num_cpus;
extern int act_cpus;
+extern void enable_fp_processing(void);
+extern void setup_mm_modes(void);
+
static int find_ticks_for_test(int test);
void find_ticks_for_pass(void);
int find_chunks(int test);
@@ -378,397 +381,19 @@ void clear_screen()
/* Test entry point. We get here on startup and also whenever
* we relocate. */
void test_start(void)
-{
- int my_cpu_num, my_cpu_ord, run;
+{ // TODO: route logging to a file or print to the console
+ int my_cpu_num, my_cpu_ord, run;
/* If this is the first time here we are CPU 0 */
if (start_seq == 0) {
my_cpu_num = 0;
} else {
- my_cpu_num = smp_my_cpu_num();
- }
- /* First thing, switch to main stack */
- switch_to_main_stack(my_cpu_num);
-
- /* First time (for this CPU) initialization */
- if (start_seq < 2) {
-
- /* These steps are only done by the boot cpu */
- if (my_cpu_num == 0) {
- my_cpu_ord = cpu_ord++;
- smp_set_ordinal(my_cpu_num, my_cpu_ord);
- parse_command_line();
- clear_screen();
- /* Initialize the barrier so the lock in btrace will work.
- * Will get redone later when we know how many CPUs we have */
- barrier_init(1);
- btrace(my_cpu_num, __LINE__, "Begin ", 1, 0, 0);
- /* Find memory size */
- mem_size(); /* must be called before initialise_cpus(); */
- /* Fill in the CPUID table */
- get_cpuid();
- /* Startup the other CPUs */
- start_seq = 1;
- //initialise_cpus();
- btrace(my_cpu_num, __LINE__, "BeforeInit", 1, 0, 0);
- /* Draw the screen and get system information */
- init();
-
- /* Set defaults and initialize variables */
- set_defaults();
-
- /* Setup base address for testing, 1 MB */
- win0_start = 0x100;
-
- /* Set relocation address to 32Mb if there is enough
- * memory. Otherwise set it to 3Mb */
- /* Large reloc addr allows for more testing overlap */
- if ((ulong)vv->pmap[vv->msegs-1].end > 0x2f00) {
- high_test_adr = 0x2000000;
- } else {
- high_test_adr = 0x300000;
- }
- win1_end = (high_test_adr >> 12);
-
- /* Adjust the map to not test the page at 939k,
- * reserved for locks */
- vv->pmap[0].end--;
-
- find_ticks_for_pass();
- } else {
- /* APs only, Register the APs */
- btrace(my_cpu_num, __LINE__, "AP_Start ", 0, my_cpu_num,
- cpu_ord);
- smp_ap_booted(my_cpu_num);
- /* Asign a sequential CPU ordinal to each active cpu */
- spin_lock(&barr->mutex);
- my_cpu_ord = cpu_ord++;
- smp_set_ordinal(my_cpu_num, my_cpu_ord);
- spin_unlock(&barr->mutex);
- btrace(my_cpu_num, __LINE__, "AP_Done ", 0, my_cpu_num,
- my_cpu_ord);
- }
-
- } else {
- /* Unlock after a relocation */
- spin_unlock(&barr->mutex);
- /* Get the CPU ordinal since it is lost during relocation */
- my_cpu_ord = smp_my_ord_num(my_cpu_num);
- btrace(my_cpu_num, __LINE__, "Reloc_Done",0,my_cpu_num,my_cpu_ord);
+ // TODO my_cpu_num = smp_my_cpu_num();
}
- /* A barrier to insure that all of the CPUs are done with startup */
- barrier();
- btrace(my_cpu_num, __LINE__, "1st Barr ", 1, my_cpu_num, my_cpu_ord);
-
-
- /* Setup Memory Management and measure memory speed, we do it here
- * because we need all of the available CPUs */
- if (start_seq < 2) {
-
- /* Enable floating point processing */
- if (cpu_id.fid.bits.fpu)
- __asm__ __volatile__
- (
- "movl %%cr0, %%eax\n\t"
- "andl $0x7, %%eax\n\t"
- "movl %%eax, %%cr0\n\t"
- : :
- : "ax"
- );
- if (cpu_id.fid.bits.sse)
- __asm__ __volatile__
- (
- "movl %%cr4, %%eax\n\t"
- "orl $0x00000200, %%eax\n\t"
- "movl %%eax, %%cr4\n\t"
- : :
- : "ax"
- );
-
- btrace(my_cpu_num, __LINE__, "Mem Mgmnt ",
- 1, cpu_id.fid.bits.pae, cpu_id.fid.bits.lm);
- /* Setup memory management modes */
- /* If we have PAE, turn it on */
- if (cpu_id.fid.bits.pae == 1) {
- __asm__ __volatile__
- (
- "movl %%cr4, %%eax\n\t"
- "orl $0x00000020, %%eax\n\t"
- "movl %%eax, %%cr4\n\t"
- : :
- : "ax"
- );
- cprint(LINE_TITLE+1, COL_MODE, "(PAE Mode)");
- }
- /* If this is a 64 CPU enable long mode */
- if (cpu_id.fid.bits.lm == 1) {
- __asm__ __volatile__
- (
- "movl $0xc0000080, %%ecx\n\t"
- "rdmsr\n\t"
- "orl $0x00000100, %%eax\n\t"
- "wrmsr\n\t"
- : :
- : "ax", "cx"
- );
- cprint(LINE_TITLE+1, COL_MODE, "(X64 Mode)");
- }
- /* Get the memory Speed with all CPUs */
- get_mem_speed(my_cpu_num, num_cpus);
- }
-
- /* Set the initialized flag only after all of the CPU's have
- * Reached the barrier. This insures that relocation has
- * been completed for each CPU. */
- btrace(my_cpu_num, __LINE__, "Start Done", 1, 0, 0);
- start_seq = 2;
-
- /* Loop through all tests */
- while (1) {
- /* If the restart flag is set all initial params */
- if (restart_flag) {
- set_defaults();
- continue;
- }
- /* Skip single CPU tests if we are using only one CPU */
- if (tseq[test].cpu_sel == -1 &&
- (num_cpus == 1 || cpu_mode != CPM_ALL)) {
- test++;
- continue;
- }
-
- test_setup();
-
- /* Loop through all possible windows */
- while (win_next <= ((ulong)vv->pmap[vv->msegs-1].end + WIN_SZ_PAGES)) {
-
- /* Main scheduling barrier */
- cprint(8, my_cpu_num+7, "W");
- btrace(my_cpu_num, __LINE__, "Sched_Barr", 1,window,win_next);
- barrier();
-
- /* Don't go over the 8TB PAE limit */
- if (win_next > MAX_MEM_PAGES) {
- break;
- }
-
- /* For the bit fade test, #11, we cannot relocate so bump the
- * window to 1 */
- if (tseq[test].pat == 11 && window == 0) {
- window = 1;
- }
-
- /* Relocate if required */
- if (window != 0 && (ulong)&_start != LOW_TEST_ADR) {
- btrace(my_cpu_num, __LINE__, "Sched_RelL", 1,0,0);
- run_at(LOW_TEST_ADR, my_cpu_num);
- }
- if (window == 0 && vv->plim_lower >= win0_start) {
- window++;
- }
- if (window == 0 && (ulong)&_start == LOW_TEST_ADR) {
- btrace(my_cpu_num, __LINE__, "Sched_RelH", 1,0,0);
- run_at(high_test_adr, my_cpu_num);
- }
-
- /* Decide which CPU(s) to use */
- btrace(my_cpu_num, __LINE__, "Sched_CPU0",1,cpu_sel,
- tseq[test].cpu_sel);
- run = 1;
- switch(cpu_mode) {
- case CPM_RROBIN:
- case CPM_SEQ:
- /* Select a single CPU */
- if (my_cpu_ord == cpu_sel) {
- mstr_cpu = cpu_sel;
- run_cpus = 1;
- } else {
- run = 0;
- }
- break;
- case CPM_ALL:
- /* Use all CPUs */
- if (tseq[test].cpu_sel == -1) {
- /* Round robin through all of the CPUs */
- if (my_cpu_ord == cpu_sel) {
- mstr_cpu = cpu_sel;
- run_cpus = 1;
- } else {
- run = 0;
- }
- } else {
- /* Use the number of CPUs specified by the test,
- * Starting with zero */
- if (my_cpu_ord >= tseq[test].cpu_sel) {
- run = 0;
- }
- /* Set the master CPU to the highest CPU number
- * that has been selected */
- if (act_cpus < tseq[test].cpu_sel) {
- mstr_cpu = act_cpus-1;
- run_cpus = act_cpus;
- } else {
- mstr_cpu = tseq[test].cpu_sel-1;
- run_cpus = tseq[test].cpu_sel;
- }
- }
- }
- btrace(my_cpu_num, __LINE__, "Sched_CPU1",1,run_cpus,run);
- barrier();
- dprint(9, 7, run_cpus, 2, 0);
-
- /* Setup a sub barrier for only the selected CPUs */
- if (my_cpu_ord == mstr_cpu) {
- s_barrier_init(run_cpus);
- }
-
- /* Make sure the the sub barrier is ready before proceeding */
- barrier();
-
- /* Not selected CPUs go back to the scheduling barrier */
- if (run == 0 ) {
- continue;
- }
- cprint(8, my_cpu_num+7, "-");
- btrace(my_cpu_num, __LINE__, "Sched_Win0",1,window,win_next);
-
- if (my_cpu_ord == mstr_cpu) {
- switch (window) {
- /* Special case for relocation */
- case 0:
- winx.start = 0;
- winx.end = win1_end;
- window++;
- break;
- /* Special case for first segment */
- case 1:
- winx.start = win0_start;
- winx.end = WIN_SZ_PAGES;
- win_next += WIN_SZ_PAGES;
- window++;
- break;
- /* For all other windows */
- default:
- winx.start = win_next;
- win_next += WIN_SZ_PAGES;
- winx.end = win_next;
- }
- btrace(my_cpu_num,__LINE__,"Sched_Win1",1,winx.start,
- winx.end);
-
- /* Find the memory areas to test */
- segs = compute_segments(winx, my_cpu_num);
- }
- s_barrier();
- btrace(my_cpu_num,__LINE__,"Sched_Win2",1,segs,
- vv->map[0].pbase_addr);
-
- if (segs == 0) {
- /* No memory in this window so skip it */
- continue;
- }
-
- /* map in the window... */
- if (map_page(vv->map[0].pbase_addr) < 0) {
- /* Either there is no PAE or we are at the PAE limit */
- break;
- }
-
- btrace(my_cpu_num, __LINE__, "Strt_Test ",1,my_cpu_num,
- my_cpu_ord);
- do_test(my_cpu_ord);
- btrace(my_cpu_num, __LINE__, "End_Test ",1,my_cpu_num,
- my_cpu_ord);
-
- paging_off();
-
- } /* End of window loop */
-
- s_barrier();
- btrace(my_cpu_num, __LINE__, "End_Win ",1,test, window);
-
- /* Setup for the next set of windows */
- win_next = 0;
- window = 0;
- bail = 0;
-
- /* Only the master CPU does the end of test housekeeping */
- if (my_cpu_ord != mstr_cpu) {
- continue;
- }
-
- /* Special handling for the bit fade test #11 */
- if (tseq[test].pat == 11 && bitf_seq != 6) {
- /* Keep going until the sequence is complete. */
- bitf_seq++;
- continue;
- } else {
- bitf_seq = 0;
- }
-
- /* Select advancement of CPUs and next test */
- switch(cpu_mode) {
- case CPM_RROBIN:
- if (++cpu_sel >= act_cpus) {
- cpu_sel = 0;
- }
- next_test();
- break;
- case CPM_SEQ:
- if (++cpu_sel >= act_cpus) {
- cpu_sel = 0;
- next_test();
- }
- break;
- case CPM_ALL:
- if (tseq[test].cpu_sel == -1)
- {
- /* Do the same test for each CPU */
- if (++cpu_sel >= act_cpus)
- {
- cpu_sel = 0;
- next_test();
- } else {
- continue;
- }
- } else {
- next_test();
- }
- } //????
- btrace(my_cpu_num, __LINE__, "Next_CPU ",1,cpu_sel,test);
-
- /* If this was the last test then we finished a pass */
- if (pass_flag)
- {
- pass_flag = 0;
-
- vv->pass++;
-
- dprint(LINE_INFO, 49, vv->pass, 5, 0);
- find_ticks_for_pass();
- ltest = -1;
-
- if (vv->ecount == 0)
- {
- /* If onepass is enabled and we did not get any errors
- * reboot to exit the test */
- if (onepass) { reboot(); }
- if (!btflag)
- cprint(LINE_MSG, COL_MSG-8,
- "** Pass complete, no errors, press Esc to exit **");
- if(BEEP_END_NO_ERROR)
- {
- beep(1000);
- beep(2000);
- beep(1000);
- beep(2000);
- }
- }
- }
-
- bail=0;
- } /* End test loop */
+ /* First thing, switch to main stack */
+    // TODO create head.S to get the boot_stack pointer?
+ //switch_to_main_stack(my_cpu_num);
}
void test_setup()
diff --git a/efi_memtest/memtest86+/test.h b/efi_memtest/memtest86+/test.h
index 8b2e924..ccf6b66 100644
--- a/efi_memtest/memtest86+/test.h
+++ b/efi_memtest/memtest86+/test.h
@@ -4,6 +4,8 @@
* By Chris Brady
*/
+#include "test_cache.h"
+
#ifndef _TEST_H_
#define _TEST_H_
#define E88 0x00
@@ -227,26 +229,6 @@ struct pair {
ulong mask;
};
-static inline void cache_off(void)
-{
- asm(
- "push %eax\n\t"
- "movl %cr0,%eax\n\t"
- "orl $0x40000000,%eax\n\t" /* Set CD */
- "movl %eax,%cr0\n\t"
- "wbinvd\n\t"
- "pop %eax\n\t");
-}
-
-static inline void cache_on(void)
-{
- asm(
- "push %eax\n\t"
- "movl %cr0,%eax\n\t"
- "andl $0x9fffffff,%eax\n\t" /* Clear CD and NW */
- "movl %eax,%cr0\n\t"
- "pop %eax\n\t");
-}
struct mmap {
ulong pbase_addr;