Diffstat (limited to 'memtestEDK/Memtest/SingleComponents/test.c')
-rw-r--r--   memtestEDK/Memtest/SingleComponents/test.c   1549
1 file changed, 1549 insertions, 0 deletions
diff --git a/memtestEDK/Memtest/SingleComponents/test.c b/memtestEDK/Memtest/SingleComponents/test.c
new file mode 100644
index 0000000..864dfcc
--- /dev/null
+++ b/memtestEDK/Memtest/SingleComponents/test.c
@@ -0,0 +1,1549 @@

/* test.c - MemTest-86 Version 3.4
 *
 * Released under version 2 of the GNU General Public License.
 * By Chris Brady
 * ----------------------------------------------------
 * MemTest86+ V5 Specific code (GPL V2.0)
 * By Samuel DEMEULEMEESTER, sdemeule@memtest.org
 * http://www.canardpc.com - http://www.memtest.org
 * Thanks to Passmark for calculate_chunk() and various comments!
 */

#include "test.h"
#include "config.h"
#include "stdint.h"
#include "cpuid.h"
#include "smp.h"
#include "io.h"

extern struct cpu_ident cpu_id;
extern volatile int mstr_cpu;
extern volatile int run_cpus;
extern volatile int test;
extern volatile int segs, bail;
extern int test_ticks, nticks;
extern struct tseq tseq[];
extern void update_err_counts(void);
extern void print_err_counts(void);
void rand_seed(unsigned int seed1, unsigned int seed2, int me);
ulong rand(int me);
void poll_errors();

// NOTE(jcoiner):
// Defining 'STATIC' to the empty string results in crashes. (It should
// work fine, of course.) I suspect relocation problems in reloc.c.
// When we declare these routines static, we use relative addresses
// for them instead of looking up their addresses in (supposedly
// relocated) global elf tables, which avoids the crashes.

#define STATIC static
//#define STATIC

#define PREFER_C 0

static const void* const nullptr = 0x0;

// Writes *start and *end with the VA range to test.
//
// me             - this thread's CPU number
// j              - index into vv->map for the current segment we are testing
// makeMultipleOf - number of bytes to align each block to
STATIC void calculate_chunk(ulong** start, ulong** end, int me,
                            int j, int makeMultipleOf) {
    ulong chunk;

    // If we are only running 1 CPU then test the whole block
    if (run_cpus == 1) {
        *start = vv->map[j].start;
        *end = vv->map[j].end;
    } else {

        // Divide the current segment by the number of CPUs
        chunk = (ulong)vv->map[j].end-(ulong)vv->map[j].start;
        chunk /= run_cpus;

        // Round up to the nearest multiple of makeMultipleOf
        chunk = (chunk + (makeMultipleOf-1)) & ~(makeMultipleOf-1);

        // Figure out chunk boundaries
        *start = (ulong*)((ulong)vv->map[j].start+(chunk*me));
        /* Set the end addr for the highest CPU num to the
         * end of the segment to absorb rounding errors */
        /* Also rounds down to a boundary if needed; may miss some ram but
           better than crashing or producing false errors. */
        /* This rounding probably will never happen as the segments should
           be in 4096-byte pages if I understand correctly. */
        if (me == mstr_cpu) {
            *end = (ulong*)(vv->map[j].end);
        } else {
            *end = (ulong*)((ulong)(*start) + chunk);
            (*end)--;
        }
    }
}

/* Call segment_fn() for each up-to-SPINSZ segment between
 * 'start' and 'end'.
 */
void foreach_segment
(ulong* start, ulong* end,
 int me, const void* ctx, segment_fn func) {

    ASSERT(start < end);

    // Confirm 'start' points to an even dword and 'end' to an odd dword
    ASSERT(0   == (((ulong)start) & 0x7));
    ASSERT(0x4 == (((ulong)end)   & 0x7));

    // 'end' may be exactly 0xfffffffc, right at the 4GB boundary.
    //
    // To avoid overflow in our loop tests and length calculations,
    // work in dword indices (the '_dw' vars) rather than byte addresses.
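    // Worked example (illustrative): if end == 0xfffffffc then
    // end_dw == 0x3fffffff and, after the increment below, 0x40000000.
    // Working in dword units keeps every value in this loop well below
    // 2^32, so the "seg_end_dw + SPINSZ_DWORDS" arithmetic cannot wrap,
    // which byte addresses right at the 4GB boundary would.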
    ulong start_dw = ((ulong)start) >> 2;
    ulong end_dw   = ((ulong)end)   >> 2;

    // end is always xxxxxffc, but increment end_dw to an
    // address beyond the segment for easier boundary calculations.
    ++end_dw;

    ulong seg_dw     = start_dw;
    ulong seg_end_dw = start_dw;

    int done = 0;
    do {
        do_tick(me);
        { BAILR }

        // ensure no overflow
        ASSERT((seg_end_dw + SPINSZ_DWORDS) > seg_end_dw);
        seg_end_dw += SPINSZ_DWORDS;

        if (seg_end_dw >= end_dw) {
            seg_end_dw = end_dw;
            done++;
        }
        if (seg_dw == seg_end_dw) {
            break;
        }

        ASSERT(((ulong)seg_end_dw) <= 0x40000000);
        ASSERT(seg_end_dw > seg_dw);
        ulong seg_len_dw = seg_end_dw - seg_dw;

        func((ulong*)(seg_dw << 2), seg_len_dw, ctx);

        seg_dw = seg_end_dw;
    } while (!done);
}

/* Calls segment_fn() for each segment in vv->map.
 *
 * Does not slice by CPU number, so it covers the entire memory.
 * Contrast to sliced_foreach_segment().
 */
STATIC void unsliced_foreach_segment
(const void* ctx, int me, segment_fn func) {
    int j;
    for (j=0; j<segs; j++) {
        foreach_segment(vv->map[j].start,
                        vv->map[j].end,
                        me, ctx, func);
    }
}

/* Calls segment_fn() for each segment to be tested by CPU 'me'.
 *
 * In multicore mode, slices the segments by 'me' (the CPU ordinal
 * number) so that each call will cover only 1/Nth of memory.
 */
STATIC void sliced_foreach_segment
(const void *ctx, int me, segment_fn func) {
    int j;
    ulong *start, *end;  // VAs
    ulong* prev_end = 0;
    for (j=0; j<segs; j++) {
        calculate_chunk(&start, &end, me, j, 64);

        // Ensure no overlap among chunks
        ASSERT(end > start);
        if (prev_end > 0) {
            ASSERT(prev_end < start);
        }
        prev_end = end;

        foreach_segment(start, end, me, ctx, func);
    }
}

STATIC void addr_tst1_seg(ulong* restrict buf,
                          ulong len_dw, const void* unused) {
    // Within each segment:
    //  - choose a low dword offset 'off'
    //  - write pat to *off
    //  - write ~pat to addresses that are above off by
    //    1, 2, 4, ... dwords up to the top of the segment. None
    //    should alias to the original dword.
    //  - write ~pat to addresses that are below off by
    //    1, 2, 4, etc dwords, down to the start of the segment. None
    //    should alias to the original dword. If adding a given offset
    //    doesn't produce a single bit address flip (because it produced
    //    a carry) subtracting the same offset should give a single bit flip.
    //  - repeat this, moving off ahead in increments of 1MB;
    //    this covers address bits within physical memory banks, we hope?
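    // Worked example (illustrative): for off at dword 0, the loops below
    // probe dword offsets +1, +2, +4, ... and (once off > 0) -1, -2, -4, ...
    // Each power-of-two offset normally changes a single address bit, so a
    // stuck or shorted address line makes one of these probe writes land on
    // buf[off] itself, and the immediate re-read of buf[off] catches it.
    // The 1 << 18 dword stride between 'off' values is the 1MB step
    // mentioned above.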

    ulong pat;
    int k;

    for (pat=0x5555aaaa, k=0; k<2; k++) {
        hprint(LINE_PAT, COL_PAT, pat);

        for (ulong off_dw = 0; off_dw < len_dw; off_dw += (1 << 18)) {
            buf[off_dw] = pat;
            pat = ~pat;

            for (ulong more_off_dw = 1; off_dw + more_off_dw < len_dw;
                 more_off_dw = more_off_dw << 1) {
                ASSERT(more_off_dw);  // it should never get to zero
                buf[off_dw + more_off_dw] = pat;
                ulong bad;
                if ((bad = buf[off_dw]) != ~pat) {
                    ad_err1(buf + off_dw,
                            buf + off_dw + more_off_dw,
                            bad, ~pat);
                    break;
                }
            }
            for (ulong more_off_dw = 1; off_dw > more_off_dw;
                 more_off_dw = more_off_dw << 1) {
                ASSERT(more_off_dw);  // it should never get to zero
                buf[off_dw - more_off_dw] = pat;
                ulong bad;
                if ((bad = buf[off_dw]) != ~pat) {
                    ad_err1(buf + off_dw,
                            buf + off_dw - more_off_dw,
                            bad, ~pat);
                    break;
                }
            }
        }
    }
}

/*
 * Memory address test, walking ones
 */
void addr_tst1(int me)
{
    unsliced_foreach_segment(nullptr, me, addr_tst1_seg);
}

STATIC void addr_tst2_init_segment(ulong* p,
                                   ulong len_dw, const void* unused) {
    ulong* pe = p + (len_dw - 1);

    /* Original C code replaced with hand tuned assembly code
     * for (; p <= pe; p++) {
     *     *p = (ulong)p;
     * }
     */
    asm __volatile__ (
        "jmp L91\n\t"
        ".p2align 4,,7\n\t"
        "L90:\n\t"
        "addl $4,%%edi\n\t"
        "L91:\n\t"
        "movl %%edi,(%%edi)\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L90\n\t"
        : : "D" (p), "d" (pe)
    );
}

STATIC void addr_tst2_check_segment(ulong* p,
                                    ulong len_dw, const void* unused) {
    ulong* pe = p + (len_dw - 1);

    /* Original C code replaced with hand tuned assembly code
     * for (; p <= pe; p++) {
     *     if ((bad = *p) != (ulong)p) {
     *         ad_err2((ulong)p, bad);
     *     }
     * }
     */
    asm __volatile__
    (
        "jmp L95\n\t"
        ".p2align 4,,7\n\t"
        "L99:\n\t"
        "addl $4,%%edi\n\t"
        "L95:\n\t"
        "movl (%%edi),%%ecx\n\t"
        "cmpl %%edi,%%ecx\n\t"
        "jne L97\n\t"
        "L96:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L99\n\t"
        "jmp L98\n\t"

        "L97:\n\t"
        "pushl %%edx\n\t"
        "pushl %%ecx\n\t"
        "pushl %%edi\n\t"
        "call ad_err2\n\t"
        "popl %%edi\n\t"
        "popl %%ecx\n\t"
        "popl %%edx\n\t"
        "jmp L96\n\t"

        "L98:\n\t"
        : : "D" (p), "d" (pe)
        : "ecx"
    );
}

/*
 * Memory address test, own address
 */
void addr_tst2(int me)
{
    cprint(LINE_PAT, COL_PAT, "address ");

    /* Write each address with its own address */
    unsliced_foreach_segment(nullptr, me, addr_tst2_init_segment);
    { BAILR }

    /* Each address should have its own address */
    unsliced_foreach_segment(nullptr, me, addr_tst2_check_segment);
}

typedef struct {
    int me;
    ulong xorVal;
} movinvr_ctx;

STATIC void movinvr_init(ulong* p,
                         ulong len_dw, const void* vctx) {
    ulong* pe = p + (len_dw - 1);
    const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;
    /* Original C code replaced with hand tuned assembly code */
    /*
    for (; p <= pe; p++) {
        *p = rand(me);
    }
    */

    asm __volatile__
    (
        "jmp L200\n\t"
        ".p2align 4,,7\n\t"
        "L201:\n\t"
        "addl $4,%%edi\n\t"
        "L200:\n\t"
        "pushl %%ecx\n\t"
        "call rand\n\t"
        "popl %%ecx\n\t"
        "movl %%eax,(%%edi)\n\t"
        "cmpl %%ebx,%%edi\n\t"
        "jb L201\n\t"
        : : "D" (p), "b" (pe), "c" (ctx->me)
        : "eax"
    );
}

STATIC void movinvr_body(ulong* p, ulong len_dw, const void* vctx) {
    ulong* pe = p + (len_dw - 1);
    const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;

    /* Original C code replaced with hand tuned assembly code */

    /*
    for (; p <= pe; p++) {
        num = rand(me);
        if (i) {
            num = ~num;
        }
        if ((bad=*p) != num) {
            mt86_error((ulong*)p, num, bad);
        }
        *p = ~num;
    }
    */

    asm __volatile__
    (
        "pushl %%ebp\n\t"

        // Skip first increment
        "jmp L26\n\t"
        ".p2align 4,,7\n\t"

        // increment 4 bytes (32-bits)
        "L27:\n\t"
        "addl $4,%%edi\n\t"

        // Check this dword
        "L26:\n\t"

        // Get the next random number, pass in me(edx), random value returned in num(eax)
        // num = rand(me);
        // cdecl call maintains all registers except eax, ecx, and edx.
        // We maintain edx with a push and pop here, using it also as an input;
        // we don't need the current eax value and want it to change to the return value;
        // we overwrite ecx shortly after this, discarding its current value.
        "pushl %%edx\n\t" // Push function inputs onto stack
        "call rand\n\t"
        "popl %%edx\n\t" // Remove function inputs from stack

        // XOR the random number with xorVal(ebx), which is either 0xffffffff
        // or 0 depending on the outer loop
        // if (i) { num = ~num; }
        "xorl %%ebx,%%eax\n\t"

        // Move the current value at the current position p(edi) into bad(ecx)
        // (bad=*p)
        "movl (%%edi),%%ecx\n\t"

        // Compare bad(ecx) to num(eax)
        "cmpl %%eax,%%ecx\n\t"

        // If not equal jump to the error case
        "jne L23\n\t"

        // Write the complement of num(eax) at the current position p(edi)
        // *p = ~num;
        "L25:\n\t"
        "movl $0xffffffff,%%ebp\n\t"
        "xorl %%ebp,%%eax\n\t"
        "movl %%eax,(%%edi)\n\t"

        // Loop until the current position p(edi) reaches the end position pe(esi)
        "cmpl %%esi,%%edi\n\t"
        "jb L27\n\t"
        "jmp L24\n"

        // Error case
        "L23:\n\t"
        // Must manually maintain eax, ecx, and edx as part of the cdecl call convention
        "pushl %%edx\n\t"
        "pushl %%ecx\n\t" // Next three pushes are function inputs
        "pushl %%eax\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t" // Remove function inputs from stack and restore register values
        "popl %%eax\n\t"
        "popl %%ecx\n\t"
        "popl %%edx\n\t"
        "jmp L25\n"

        "L24:\n\t"
        "popl %%ebp\n\t"
        :: "D" (p), "S" (pe), "b" (ctx->xorVal),
           "d" (ctx->me)
        : "eax", "ecx"
    );
}

/*
 * Test all of memory using a "half moving inversions" algorithm using random
 * numbers and their complement as the data pattern. Since we are not able to
 * produce random numbers in reverse order, testing is only done in the forward
 * direction.
 */
void movinvr(int me)
{
    int i, seed1, seed2;

    movinvr_ctx ctx;
    ctx.me = me;
    ctx.xorVal = 0;

    /* Initialize memory with the initial sequence of random numbers. */
    if (cpu_id.fid.bits.rdtsc) {
        asm __volatile__ ("rdtsc":"=a" (seed1),"=d" (seed2));
    } else {
        seed1 = 521288629 + vv->pass;
        seed2 = 362436069 - vv->pass;
    }

    /* Display the current seed */
    if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, seed1);
    rand_seed(seed1, seed2, me);

    sliced_foreach_segment(&ctx, me, movinvr_init);
    { BAILR }

    /* Do moving inversions test. Check for initial pattern and then
     * write the complement for each memory location.
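     * (Illustrative: on pass i == 0, xorVal is 0, so each location should
     * still hold its value from the identical rand() stream and is
     * rewritten with the complement; on pass i == 1, xorVal is 0xffffffff,
     * so the complement is expected and the original value is written back.)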
     */
    for (i=0; i<2; i++) {
        rand_seed(seed1, seed2, me);

        if (i) {
            ctx.xorVal = 0xffffffff;
        } else {
            ctx.xorVal = 0;
        }

        sliced_foreach_segment(&ctx, me, movinvr_body);
        { BAILR }
    }
}

typedef struct {
    ulong p1;
    ulong p2;
} movinv1_ctx;

STATIC void movinv1_init(ulong* start,
                         ulong len_dw, const void* vctx) {
    const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;

    ulong p1 = ctx->p1;
    ulong* p = start;

    asm __volatile__
    (
        "rep\n\t"
        "stosl\n\t"
        : : "c" (len_dw), "D" (p), "a" (p1)
    );
}

STATIC void movinv1_bottom_up(ulong* start,
                              ulong len_dw, const void* vctx) {
    const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
    ulong p1 = ctx->p1;
    ulong p2 = ctx->p2;
    ulong* p = start;
    ulong* pe = p + (len_dw - 1);

    // Original C code replaced with hand tuned assembly code
    // seems broken
    /*
    for (; p <= pe; p++) {
        if ((bad=*p) != p1) {
            mt86_error((ulong*)p, p1, bad);
        }
        *p = p2;
    }
    */

    asm __volatile__
    (
        "jmp L2\n\t"
        ".p2align 4,,7\n\t"
        "L0:\n\t"
        "addl $4,%%edi\n\t"
        "L2:\n\t"
        "movl (%%edi),%%ecx\n\t"
        "cmpl %%eax,%%ecx\n\t"
        "jne L3\n\t"
        "L5:\n\t"
        "movl %%ebx,(%%edi)\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L0\n\t"
        "jmp L4\n"

        "L3:\n\t"
        "pushl %%edx\n\t"
        "pushl %%ebx\n\t"
        "pushl %%ecx\n\t"
        "pushl %%eax\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "popl %%eax\n\t"
        "popl %%ecx\n\t"
        "popl %%ebx\n\t"
        "popl %%edx\n\t"
        "jmp L5\n"

        "L4:\n\t"
        :: "a" (p1), "D" (p), "d" (pe), "b" (p2)
        : "ecx"
    );
}

STATIC void movinv1_top_down(ulong* start,
                             ulong len_dw, const void* vctx) {
    const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
    ulong p1 = ctx->p1;
    ulong p2 = ctx->p2;
    ulong* p = start + (len_dw - 1);
    ulong* pe = start;

    // Original C code replaced with hand tuned assembly code
    // seems broken
    /*
    do {
        if ((bad=*p) != p2) {
            mt86_error((ulong*)p, p2, bad);
        }
        *p = p1;
    } while (--p >= pe);
    */

    asm __volatile__
    (
        "jmp L9\n\t"
        ".p2align 4,,7\n\t"
        "L11:\n\t"
        "subl $4, %%edi\n\t"
        "L9:\n\t"
        "movl (%%edi),%%ecx\n\t"
        "cmpl %%ebx,%%ecx\n\t"
        "jne L6\n\t"
        "L10:\n\t"
        "movl %%eax,(%%edi)\n\t"
        "cmpl %%edi, %%edx\n\t"
        "jne L11\n\t"
        "jmp L7\n\t"

        "L6:\n\t"
        "pushl %%edx\n\t"
        "pushl %%eax\n\t"
        "pushl %%ecx\n\t"
        "pushl %%ebx\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "popl %%ebx\n\t"
        "popl %%ecx\n\t"
        "popl %%eax\n\t"
        "popl %%edx\n\t"
        "jmp L10\n"

        "L7:\n\t"
        :: "a" (p1), "D" (p), "d" (pe), "b" (p2)
        : "ecx"
    );
}

/*
 * Test all of memory using a "moving inversions" algorithm using the
 * pattern in p1 and its complement in p2.
 */
void movinv1 (int iter, ulong p1, ulong p2, int me)
{
    int i;

    /* Display the current pattern */
    if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);

    movinv1_ctx ctx;
    ctx.p1 = p1;
    ctx.p2 = p2;
    sliced_foreach_segment(&ctx, me, movinv1_init);
    { BAILR }

    /* Do moving inversions test. Check for initial pattern and then
     * write the complement for each memory location. Test from bottom
     * up and then from the top down.
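     * (Illustrative: with p1 == 0x00000000 and p2 == 0xffffffff, the
     * bottom-up pass verifies 0x00000000 and writes 0xffffffff, and the
     * top-down pass verifies 0xffffffff and restores 0x00000000.)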
     */
    for (i=0; i<iter; i++) {
        sliced_foreach_segment(&ctx, me, movinv1_bottom_up);
        { BAILR }

        // NOTE(jcoiner):
        //  For the top-down pass, the original 5.01 code iterated over
        //  'segs' from n-1 down to 0, and then within each mapped segment,
        //  it would form the SPINSZ windows from the top down -- thus forming
        //  a different set of windows than the bottom-up pass, when the segment
        //  is not an integer number of windows.
        //
        //  My guess is that this buys us very little additional coverage, that
        //  the value in going top-down happens at the word or cache-line level
        //  and that there's little to be gained from reversing the direction of
        //  the outer loops. So I'm leaving a 'direction' bit off of the
        //  foreach_segment() routines for now.
        sliced_foreach_segment(&ctx, me, movinv1_top_down);
        { BAILR }
    }
}

typedef struct {
    ulong p1;
    ulong lb;
    ulong hb;
    int sval;
    int off;
} movinv32_ctx;

STATIC void movinv32_init(ulong* restrict buf,
                          ulong len_dw, const void* vctx) {
    const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;

    ulong* p = buf;
    ulong* pe = buf + (len_dw - 1);

    int k = ctx->off;
    ulong pat = ctx->p1;
    ulong lb = ctx->lb;
    int sval = ctx->sval;

    /* Original C code replaced with hand tuned assembly code
     * while (p <= pe) {
     *     *p = pat;
     *     if (++k >= 32) {
     *         pat = lb;
     *         k = 0;
     *     } else {
     *         pat = pat << 1;
     *         pat |= sval;
     *     }
     *     p++;
     * }
     */
    asm __volatile__
    (
        "jmp L20\n\t"
        ".p2align 4,,7\n\t"
        "L923:\n\t"
        "addl $4,%%edi\n\t"
        "L20:\n\t"
        "movl %%ecx,(%%edi)\n\t"
        "addl $1,%%ebx\n\t"
        "cmpl $32,%%ebx\n\t"
        "jne L21\n\t"
        "movl %%esi,%%ecx\n\t"
        "xorl %%ebx,%%ebx\n\t"
        "jmp L22\n"
        "L21:\n\t"
        "shll $1,%%ecx\n\t"
        "orl %%eax,%%ecx\n\t"
        "L22:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L923\n\t"
        :: "D" (p),"d" (pe),"b" (k),"c" (pat),
           "a" (sval), "S" (lb)
    );
}

STATIC void movinv32_bottom_up(ulong* restrict buf, ulong len_dw,
                               const void* vctx) {
    const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;

    ulong* p = buf;
    ulong* pe = buf + (len_dw - 1);

    int k = ctx->off;
    ulong pat = ctx->p1;
    ulong lb = ctx->lb;
    int sval = ctx->sval;

    /* Original C code replaced with hand tuned assembly code
     * while (1) {
     *     if ((bad=*p) != pat) {
     *         mt86_error((ulong*)p, pat, bad);
     *     }
     *     *p = ~pat;
     *     if (p >= pe) break;
     *     p++;
     *
     *     if (++k >= 32) {
     *         pat = lb;
     *         k = 0;
     *     } else {
     *         pat = pat << 1;
     *         pat |= sval;
     *     }
     * }
     */
    asm __volatile__
    (
        "pushl %%ebp\n\t"
        "jmp L30\n\t"
        ".p2align 4,,7\n\t"
        "L930:\n\t"
        "addl $4,%%edi\n\t"
        "L30:\n\t"
        "movl (%%edi),%%ebp\n\t"
        "cmpl %%ecx,%%ebp\n\t"
        "jne L34\n\t"

        "L35:\n\t"
        "notl %%ecx\n\t"
        "movl %%ecx,(%%edi)\n\t"
        "notl %%ecx\n\t"
        "incl %%ebx\n\t"
        "cmpl $32,%%ebx\n\t"
        "jne L31\n\t"
        "movl %%esi,%%ecx\n\t"
        "xorl %%ebx,%%ebx\n\t"
        "jmp L32\n"
        "L31:\n\t"
        "shll $1,%%ecx\n\t"
        "orl %%eax,%%ecx\n\t"
        "L32:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L930\n\t"
        "jmp L33\n\t"

        "L34:\n\t"
        "pushl %%esi\n\t"
        "pushl %%eax\n\t"
        "pushl %%ebx\n\t"
        "pushl %%edx\n\t"
        "pushl %%ebp\n\t"
        "pushl %%ecx\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "popl %%ecx\n\t"
        "popl %%ebp\n\t"
        "popl %%edx\n\t"
        "popl %%ebx\n\t"
        "popl %%eax\n\t"
        "popl %%esi\n\t"
        "jmp L35\n"

        "L33:\n\t"
        "popl %%ebp\n\t"
        : "=b" (k),"=c" (pat)
        : "D" (p),"d" (pe),"b" (k),"c" (pat),
          "a" (sval), "S" (lb)
    );
}

STATIC void movinv32_top_down(ulong* restrict buf,
                              ulong len_dw, const void* vctx) {
    const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;

    ulong* pe = buf;
    ulong* p = buf + (len_dw - 1);

    int k = ctx->off;
    ulong pat = ctx->p1;
    ulong hb = ctx->hb;
    int sval = ctx->sval;
    ulong p3 = (ulong)sval << 31;

    // Advance 'k' and 'pat' to where they would have been
    // at the end of the corresponding bottom_up segment.
    //
    // The '-1' is because we didn't advance 'k' or 'pat'
    // on the final bottom_up loop, so they're off by one...
    ulong mod_len = (len_dw - 1) % 32;
    for (int i = 0; i < mod_len; i++) {
        if (++k >= 32) {
            pat = ctx->lb;
            k = 0;
        } else {
            pat = pat << 1;
            pat |= sval;
        }
    }

    // Increment 'k' only because the code below has an off-by-one
    // interpretation of 'k' relative to the bottom_up routine.
    // There it ranges from 0:31, and here it ranges from 1:32.
    k++;

    /* Original C code replaced with hand tuned assembly code */
#if PREFER_C
    ulong bad;
    while(1) {
        if ((bad=*p) != ~pat) {
            mt86_error((ulong*)p, ~pat, bad);
        }
        *p = pat;
        if (p <= pe) break;
        p--;

        if (--k <= 0) {
            k = 32;
            pat = hb;
        } else {
            pat = pat >> 1;
            pat |= p3;
        }
    };
#else
    asm __volatile__
    (
        "pushl %%ebp\n\t"
        "jmp L40\n\t"
        ".p2align 4,,7\n\t"
        "L49:\n\t"
        "subl $4,%%edi\n\t"
        "L40:\n\t"
        "movl (%%edi),%%ebp\n\t"
        "notl %%ecx\n\t"
        "cmpl %%ecx,%%ebp\n\t"
        "jne L44\n\t"

        "L45:\n\t"
        "notl %%ecx\n\t"
        "movl %%ecx,(%%edi)\n\t"
        "decl %%ebx\n\t"
        "cmpl $0,%%ebx\n\t"
        "jg L41\n\t"
        "movl %%esi,%%ecx\n\t"
        "movl $32,%%ebx\n\t"
        "jmp L42\n"
        "L41:\n\t"
        "shrl $1,%%ecx\n\t"
        "orl %%eax,%%ecx\n\t"
        "L42:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "ja L49\n\t"
        "jmp L43\n\t"

        "L44:\n\t"
        "pushl %%esi\n\t"
        "pushl %%eax\n\t"
        "pushl %%ebx\n\t"
        "pushl %%edx\n\t"
        "pushl %%ebp\n\t"
        "pushl %%ecx\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "popl %%ecx\n\t"
        "popl %%ebp\n\t"
        "popl %%edx\n\t"
        "popl %%ebx\n\t"
        "popl %%eax\n\t"
        "popl %%esi\n\t"
        "jmp L45\n"

        "L43:\n\t"
        "popl %%ebp\n\t"
        : : "D" (p),"d" (pe),"b" (k),"c" (pat),
            "a" (p3), "S" (hb)
    );
#endif
}

void movinv32(int iter, ulong p1, ulong lb, ulong hb, int sval, int off, int me)
{
    // First callsite:
    //  - p1 has 1 bit set (somewhere)
    //  - lb = 1 ("low bit")
    //  - hb = 0x80000000 ("high bit")
    //  - sval = 0
    //  - 'off' indicates the position of the set bit in p1
    //
    // Second callsite is the same, but inverted:
    //  - p1 has 1 bit clear (somewhere)
    //  - lb = 0xfffffffe
    //  - hb = 0x7fffffff
    //  - sval = 1
    //  - 'off' indicates the position of the cleared bit in p1

    movinv32_ctx ctx;
    ctx.p1 = p1;
    ctx.lb = lb;
    ctx.hb = hb;
    ctx.sval = sval;
    ctx.off = off;

    /* Display the current pattern */
    if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);

    sliced_foreach_segment(&ctx, me, movinv32_init);
    { BAILR }

    /* Do moving inversions test. Check for initial pattern and then
     * write the complement for each memory location. Test from bottom
     * up and then from the top down.
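     * (Illustrative first-callsite run with off == 2, so p1 == 0x00000004:
     * movinv32_init lays down 0x00000004, 0x00000008, ... 0x80000000, then
     * wraps to lb == 0x00000001, so the single set bit walks through all 32
     * positions every 32 dwords; the bottom-up and top-down passes then
     * verify and complement that sequence.)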
     */
    for (int i=0; i<iter; i++) {
        sliced_foreach_segment(&ctx, me, movinv32_bottom_up);
        { BAILR }

        sliced_foreach_segment(&ctx, me, movinv32_top_down);
        { BAILR }
    }
}

typedef struct {
    int offset;
    ulong p1;
    ulong p2;
} modtst_ctx;

STATIC void modtst_sparse_writes(ulong* restrict start,
                                 ulong len_dw, const void* vctx) {
    const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
    ulong p1 = ctx->p1;
    ulong offset = ctx->offset;

#if PREFER_C
    for (ulong i = offset; i < len_dw; i += MOD_SZ) {
        start[i] = p1;
    }
#else
    ulong* p = start + offset;
    ulong* pe = start + len_dw;
    asm __volatile__
    (
        "jmp L60\n\t"
        ".p2align 4,,7\n\t"

        "L60:\n\t"
        "movl %%eax,(%%edi)\n\t"
        "addl $80,%%edi\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L60\n\t"
        :: "D" (p), "d" (pe), "a" (p1)
    );
#endif
}

STATIC void modtst_dense_writes(ulong* restrict start, ulong len_dw,
                                const void* vctx) {
    const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
    ulong p2 = ctx->p2;
    ulong offset = ctx->offset;

    ASSERT(offset < MOD_SZ);

    ulong k = 0;
#if PREFER_C
    for (ulong i = 0; i < len_dw; i++) {
        if (k != offset) {
            start[i] = p2;
        }
        if (++k >= MOD_SZ) {
            k = 0;
        }
    }
#else
    ulong* pe = start + (len_dw - 1);
    asm __volatile__
    (
        "jmp L50\n\t"
        ".p2align 4,,7\n\t"

        "L54:\n\t"
        "addl $4,%%edi\n\t"
        "L50:\n\t"
        "cmpl %%ebx,%%ecx\n\t"
        "je L52\n\t"
        "movl %%eax,(%%edi)\n\t"
        "L52:\n\t"
        "incl %%ebx\n\t"
        "cmpl $19,%%ebx\n\t"
        "jle L53\n\t"
        "xorl %%ebx,%%ebx\n\t"
        "L53:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L54\n\t"
        : : "D" (start), "d" (pe), "a" (p2),
            "b" (k), "c" (offset)
    );
#endif
}

STATIC void modtst_check(ulong* restrict start,
                         ulong len_dw, const void* vctx) {
    const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
    ulong p1 = ctx->p1;
    ulong offset = ctx->offset;

    ASSERT(offset < MOD_SZ);

#if PREFER_C
    ulong bad;
    for (ulong i = offset; i < len_dw; i += MOD_SZ) {
        if ((bad = start[i]) != p1)
            mt86_error(start + i, p1, bad);
    }
#else
    ulong* p = start + offset;
    ulong* pe = start + len_dw;
    asm __volatile__
    (
        "jmp L70\n\t"
        ".p2align 4,,7\n\t"

        "L70:\n\t"
        "movl (%%edi),%%ecx\n\t"
        "cmpl %%eax,%%ecx\n\t"
        "jne L71\n\t"
        "L72:\n\t"
        "addl $80,%%edi\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L70\n\t"
        "jmp L73\n\t"

        "L71:\n\t"
        "pushl %%edx\n\t"
        "pushl %%ecx\n\t"
        "pushl %%eax\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "popl %%eax\n\t"
        "popl %%ecx\n\t"
        "popl %%edx\n\t"
        "jmp L72\n"

        "L73:\n\t"
        : : "D" (p), "d" (pe), "a" (p1)
        : "ecx"
    );
#endif
}

/*
 * Test all of memory using modulo X access pattern.
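 * (Illustrative, assuming MOD_SZ == 20 as the strides in the assembly
 * above suggest ($80 bytes == 20 dwords, counter compared against $19):
 * dwords at offset, offset+20, offset+40, ... receive p1, every other
 * dword is overwritten with p2 'iter' times, and the final pass checks
 * that the sparse p1 locations survived the dense writes.)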
 */
void modtst(int offset, int iter, ulong p1, ulong p2, int me)
{
    modtst_ctx ctx;
    ctx.offset = offset;
    ctx.p1 = p1;
    ctx.p2 = p2;

    /* Display the current pattern */
    if (mstr_cpu == me) {
        hprint(LINE_PAT, COL_PAT-2, p1);
        cprint(LINE_PAT, COL_PAT+6, "-");
        dprint(LINE_PAT, COL_PAT+7, offset, 2, 1);
    }

    /* Write every nth location with pattern */
    sliced_foreach_segment(&ctx, me, modtst_sparse_writes);
    { BAILR }

    /* Write the rest of memory "iter" times with the pattern complement */
    for (ulong i=0; i<iter; i++) {
        sliced_foreach_segment(&ctx, me, modtst_dense_writes);
        { BAILR }
    }

    /* Now check every nth location */
    sliced_foreach_segment(&ctx, me, modtst_check);
}

#if PREFER_C

STATIC void movsl(ulong* dest,
                  ulong* src,
                  ulong size_in_dwords) {
    /* Logically equivalent to:

       for (ulong i = 0; i < size_in_dwords; i++)
           dest[i] = src[i];

       However: the movsl instruction does the entire loop
       in one instruction -- this is probably how 'memcpy'
       is implemented -- so hardware makes it very fast.

       Even in PREFER_C mode, we want the brute force of movsl!
    */
    asm __volatile__
    (
        "cld\n"
        "jmp L1189\n\t"

        ".p2align 4,,7\n\t"
        "L1189:\n\t"

        "movl %1,%%edi\n\t" // dest
        "movl %0,%%esi\n\t" // src
        "movl %2,%%ecx\n\t" // len in dwords
        "rep\n\t"
        "movsl\n\t"

        :: "g" (src), "g" (dest), "g" (size_in_dwords)
        : "edi", "esi", "ecx"
    );
}
#endif // PREFER_C

STATIC ulong block_move_normalize_len_dw(ulong len_dw) {
    // The block_move test works with sets of 64-byte blocks,
    // so ensure our total length is a multiple of 64.
    //
    // In fact, since we divide the region in half, and each half-region
    // is a set of 64-byte blocks, the full region should be a multiple of 128
    // bytes.
    //
    // Note that there's no requirement for the start address of the region to
    // be 64-byte aligned, it can be any dword.
    ulong result = (len_dw >> 5) << 5;
    ASSERT(result > 0);
    return result;
}

STATIC void block_move_init(ulong* restrict buf,
                            ulong len_dw, const void* unused_ctx) {
    len_dw = block_move_normalize_len_dw(len_dw);

    // Compute 'len' in units of 64-byte chunks:
    ulong len = len_dw >> 4;

    // We only need to initialize len/2, since we'll just copy
    // the first half onto the second half in the move step.
    len = len >> 1;

    ulong base_val = 1;
#if PREFER_C
    while(len > 0) {
        ulong neg_val = ~base_val;

        // Set a block of 64 bytes   // first block DWORDS are:
        buf[0]  = base_val;          // 0x00000001
        buf[1]  = base_val;          // 0x00000001
        buf[2]  = base_val;          // 0x00000001
        buf[3]  = base_val;          // 0x00000001
        buf[4]  = neg_val;           // 0xfffffffe
        buf[5]  = neg_val;           // 0xfffffffe
        buf[6]  = base_val;          // 0x00000001
        buf[7]  = base_val;          // 0x00000001
        buf[8]  = base_val;          // 0x00000001
        buf[9]  = base_val;          // 0x00000001
        buf[10] = neg_val;           // 0xfffffffe
        buf[11] = neg_val;           // 0xfffffffe
        buf[12] = base_val;          // 0x00000001
        buf[13] = base_val;          // 0x00000001
        buf[14] = neg_val;           // 0xfffffffe
        buf[15] = neg_val;           // 0xfffffffe

        buf += 16;  // advance to next 64-byte block
        len--;

        // Rotate the bit left, including an all-zero state.
        // It can't hurt to have a periodicity of 33 instead of
        // a power of two.
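        // Illustrative sequence of base_val over successive blocks:
        //   0x00000001, 0x00000002, ... 0x80000000, 0x00000000, 0x00000001, ...
        // i.e. 32 single-bit values plus the all-zero state, 33 states in all,
        // so the pattern never lines up with a power-of-two block size.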
        if (base_val == 0) {
            base_val = 1;
        } else if (base_val & 0x80000000) {
            base_val = 0;
        } else {
            base_val = base_val << 1;
        }
    }
#else
    asm __volatile__
    (
        "jmp L100\n\t"

        ".p2align 4,,7\n\t"
        "L100:\n\t"

        // First loop eax is 0x00000001, edx is 0xfffffffe
        "movl %%eax, %%edx\n\t"
        "notl %%edx\n\t"

        // Set a block of 64 bytes     // First loop DWORDS are
        "movl %%eax,0(%%edi)\n\t"      // 0x00000001
        "movl %%eax,4(%%edi)\n\t"      // 0x00000001
        "movl %%eax,8(%%edi)\n\t"      // 0x00000001
        "movl %%eax,12(%%edi)\n\t"     // 0x00000001
        "movl %%edx,16(%%edi)\n\t"     // 0xfffffffe
        "movl %%edx,20(%%edi)\n\t"     // 0xfffffffe
        "movl %%eax,24(%%edi)\n\t"     // 0x00000001
        "movl %%eax,28(%%edi)\n\t"     // 0x00000001
        "movl %%eax,32(%%edi)\n\t"     // 0x00000001
        "movl %%eax,36(%%edi)\n\t"     // 0x00000001
        "movl %%edx,40(%%edi)\n\t"     // 0xfffffffe
        "movl %%edx,44(%%edi)\n\t"     // 0xfffffffe
        "movl %%eax,48(%%edi)\n\t"     // 0x00000001
        "movl %%eax,52(%%edi)\n\t"     // 0x00000001
        "movl %%edx,56(%%edi)\n\t"     // 0xfffffffe
        "movl %%edx,60(%%edi)\n\t"     // 0xfffffffe

        // rotate left with carry,
        // second loop eax is 0x00000002
        // second loop edx is (~eax) 0xfffffffd
        "rcll $1, %%eax\n\t"

        // Move current position forward 64 bytes (to start of next block)
        "leal 64(%%edi), %%edi\n\t"

        // Loop until end
        "decl %%ecx\n\t"
        "jnz L100\n\t"

        : : "D" (buf), "c" (len), "a" (base_val)
        : "edx"
    );
#endif
}

typedef struct {
    int iter;
    int me;
} block_move_ctx;

STATIC void block_move_move(ulong* restrict buf,
                            ulong len_dw, const void* vctx) {
    const block_move_ctx* restrict ctx = (const block_move_ctx*)vctx;
    ulong iter = ctx->iter;
    int me = ctx->me;

    len_dw = block_move_normalize_len_dw(len_dw);

    /* Now move the data around
     * First move the data up half of the segment size we are testing
     * Then move the data to the original location + 32 bytes
     */
    ulong half_len_dw = len_dw / 2;  // Half the size of this block in DWORDS
    ASSERT(half_len_dw > 8);

    ulong* mid = buf + half_len_dw;  // VA at the mid-point of this block.
    for (int i=0; i<iter; i++) {
        if (i > 0) {
            // foreach_segment() called this before the 0th iteration,
            // so don't tick twice in quick succession.
            do_tick(me);
        }
        { BAILR }

#if PREFER_C
        // Move first half to 2nd half:
        movsl(/*dest=*/ mid, /*src=*/ buf, half_len_dw);

        // Move the second half, less the last 8 dwords,
        // to the first half plus an offset of 8 dwords.
        movsl(/*dest=*/ buf + 8, /*src=*/ mid, half_len_dw - 8);

        // Finally, move the last 8 dwords of the 2nd half
        // to the first 8 dwords of the first half.
        movsl(/*dest=*/ mid + half_len_dw - 8, /*src=*/ buf, 8);
#else
        asm __volatile__
        (
            "cld\n"
            "jmp L110\n\t"

            ".p2align 4,,7\n\t"
            "L110:\n\t"

            //
            // At the end of all this:
            // - the second half equals the initial value of the first half
            // - the first half is right shifted 32 bytes (with wrapping)
            //

            // Move first half to second half
            "movl %1,%%edi\n\t" // Destination, 'mid' (mid point)
            "movl %0,%%esi\n\t" // Source, 'buf' (start point)
            "movl %2,%%ecx\n\t" // Length, 'half_len_dw' (size of a half in DWORDS)
            "rep\n\t"
            "movsl\n\t"

            // Move the second half, less the last 32 bytes,
            // to the first half plus an offset of 32 bytes
            "movl %0,%%edi\n\t"
            "addl $32,%%edi\n\t" // Destination, 'buf' plus 32 bytes
            "movl %1,%%esi\n\t"  // Source, 'mid'
            "movl %2,%%ecx\n\t"
            "subl $8,%%ecx\n\t"  // Length, 'half_len_dw' minus 8 DWORDS
            "rep\n\t"
            "movsl\n\t"

            // Move the last 8 DWORDS (32 bytes) of the second half to the start of the first half
            "movl %0,%%edi\n\t"  // Destination, 'buf'
            // Source, 8 DWORDS from the end of the second half, left over by the last rep/movsl
            "movl $8,%%ecx\n\t"  // Length, 8 DWORDS (32 bytes)
            "rep\n\t"
            "movsl\n\t"

            :: "g" (buf), "g" (mid), "g" (half_len_dw)
            : "edi", "esi", "ecx"
        );
#endif
    }
}

STATIC void block_move_check(ulong* restrict buf,
                             ulong len_dw, const void* unused_ctx) {
    len_dw = block_move_normalize_len_dw(len_dw);

    /* Now check the data.
     * This is rather crude, we just check that the
     * adjacent words are the same.
     */
#if PREFER_C
    for (ulong i = 0; i < len_dw; i = i + 2) {
        if (buf[i] != buf[i+1]) {
            mt86_error(buf+i, buf[i], buf[i+1]);
        }
    }
#else
    ulong* pe = buf + (len_dw - 2);
    asm __volatile__
    (
        "jmp L120\n\t"

        ".p2align 4,,7\n\t"
        "L124:\n\t"
        "addl $8,%%edi\n\t" // Next QWORD
        "L120:\n\t"

        // Compare adjacent DWORDS
        "movl (%%edi),%%ecx\n\t"
        "cmpl 4(%%edi),%%ecx\n\t"
        "jnz L121\n\t" // Print error if they don't match

        // Loop until end of block
        "L122:\n\t"
        "cmpl %%edx,%%edi\n\t"
        "jb L124\n"
        "jmp L123\n\t"

        "L121:\n\t"
        // eax is not used, so we don't need to save it as per cdecl
        // ecx is used but not restored; we don't need its value after this point
        "pushl %%edx\n\t"
        "pushl 4(%%edi)\n\t"
        "pushl %%ecx\n\t"
        "pushl %%edi\n\t"
        "call mt86_error\n\t"
        "popl %%edi\n\t"
        "addl $8,%%esp\n\t"
        "popl %%edx\n\t"
        "jmp L122\n"
        "L123:\n\t"
        :: "D" (buf), "d" (pe)
        : "ecx"
    );
#endif
}

/*
 * Test memory using block moves
 * Adapted from Robert Redelmeier's burnBX test
 */
void block_move(int iter, int me)
{
    cprint(LINE_PAT, COL_PAT-2, " ");

    block_move_ctx ctx;
    ctx.iter = iter;
    ctx.me = me;

    /* Initialize memory with the initial pattern. */
    sliced_foreach_segment(&ctx, me, block_move_init);
    { BAILR }
    s_barrier();

    /* Now move the data around */
    sliced_foreach_segment(&ctx, me, block_move_move);
    { BAILR }
    s_barrier();

    /* And check it. */
    sliced_foreach_segment(&ctx, me, block_move_check);
}

typedef struct {
    ulong pat;
} bit_fade_ctx;

STATIC void bit_fade_fill_seg(ulong* restrict p,
                              ulong len_dw, const void* vctx) {
    const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx;
    ulong pat = ctx->pat;

    for (ulong i = 0; i < len_dw; i++) {
        p[i] = pat;
    }
}

/*
 * Test memory for bit fade, fill memory with pattern.
 */
void bit_fade_fill(ulong p1, int me)
{
    /* Display the current pattern */
    hprint(LINE_PAT, COL_PAT, p1);

    /* Initialize memory with the initial pattern.
*/ + bit_fade_ctx ctx; + ctx.pat = p1; + unsliced_foreach_segment(&ctx, me, bit_fade_fill_seg); +} + +STATIC void bit_fade_chk_seg(ulong* restrict p, + ulong len_dw, const void* vctx) { + const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx; + ulong pat = ctx->pat; + + for (ulong i = 0; i < len_dw; i++) { + ulong bad; + if ((bad=p[i]) != pat) { + mt86_error(p+i, pat, bad); + } + } +} + +void bit_fade_chk(ulong p1, int me) +{ + bit_fade_ctx ctx; + ctx.pat = p1; + + /* Make sure that nothing changed while sleeping */ + unsliced_foreach_segment(&ctx, me, bit_fade_chk_seg); +} + +/* Sleep for N seconds */ +void sleep(long n, int flag, int me, + int sms /* interpret 'n' as milliseconds instead */) +{ + ulong sh, sl, l, h, t, ip=0; + + /* save the starting time */ + asm __volatile__( + "rdtsc":"=a" (sl),"=d" (sh)); + + /* loop for n seconds */ + while (1) { + asm __volatile__( + "rep ; nop\n\t" + "rdtsc":"=a" (l),"=d" (h)); + asm __volatile__ ( + "subl %2,%0\n\t" + "sbbl %3,%1" + :"=a" (l), "=d" (h) + :"g" (sl), "g" (sh), + "0" (l), "1" (h)); + + if (sms != 0) { + t = h * ((unsigned)0xffffffff / vv->clks_msec); + t += (l / vv->clks_msec); + } else { + t = h * ((unsigned)0xffffffff / vv->clks_msec) / 1000; + t += (l / vv->clks_msec) / 1000; + } + + /* Is the time up? */ + if (t >= n) { + break; + } + + /* Only display elapsed time if flag is set */ + if (flag == 0) { + continue; + } + + if (t != ip) { + do_tick(me); + { BAILR } + ip = t; + } + } +} + +void beep(unsigned int frequency) +{ +#if 0 + // BOZO(jcoiner) + // Removed this, we need to define outb_p() and inb_p() + // before reintroducing it. +#else + unsigned int count = 1193180 / frequency; + + // Switch on the speaker + outb_p(inb_p(0x61)|3, 0x61); + + // Set command for counter 2, 2 byte write + outb_p(0xB6, 0x43); + + // Select desired Hz + outb_p(count & 0xff, 0x42); + outb((count >> 8) & 0xff, 0x42); + + // Block for 100 microseconds + sleep(100, 0, 0, 1); + + // Switch off the speaker + outb(inb_p(0x61)&0xFC, 0x61); +#endif +} |