/* test.c - MemTest-86 Version 3.4
*
 * Released under version 2 of the GNU General Public License.
* By Chris Brady
* ----------------------------------------------------
* MemTest86+ V5 Specific code (GPL V2.0)
* By Samuel DEMEULEMEESTER, sdemeule@memtest.org
* http://www.canardpc.com - http://www.memtest.org
 * Thanks to Passmark for calculate_chunk() and various comments!
*/
#include "test.h"
#include "config.h"
#include "stdint.h"
#include "cpuid.h"
#include "smp.h"
#include "io.h"
extern struct cpu_ident cpu_id;
extern volatile int mstr_cpu;
extern volatile int run_cpus;
extern volatile int test;
extern volatile int segs, bail;
extern int test_ticks, nticks;
extern struct tseq tseq[];
extern void update_err_counts(void);
extern void print_err_counts(void);
void rand_seed(unsigned int seed1, unsigned int seed2, int me);
ulong rand(int me);
void poll_errors(void);
// NOTE(jcoiner):
// Defining 'STATIC' to the empty string results in crashes. (It should
// work fine, of course.) I suspect relocation problems in reloc.c.
// When we declare these routines static, we use relative addresses
// for them instead of looking up their addresses in (supposedly
// relocated) global ELF tables, which avoids the crashes.
#define STATIC static
//#define STATIC
#define PREFER_C 0
static const void* const nullptr = 0x0;
// Writes *start and *end with the VA range to test.
//
// me - this thread's CPU number
// j  - index into vv->map for the segment we are currently testing
// makeMultipleOf - byte multiple to which each block size is rounded
STATIC void calculate_chunk(ulong** start, ulong** end, int me,
int j, int makeMultipleOf) {
ulong chunk;
// If we are only running 1 CPU then test the whole block
if (run_cpus == 1) {
*start = vv->map[j].start;
*end = vv->map[j].end;
} else {
// Divide the current segment by the number of CPUs
chunk = (ulong)vv->map[j].end-(ulong)vv->map[j].start;
chunk /= run_cpus;
        // Round up to the nearest multiple of makeMultipleOf
chunk = (chunk + (makeMultipleOf-1)) & ~(makeMultipleOf-1);
// Figure out chunk boundaries
*start = (ulong*)((ulong)vv->map[j].start+(chunk*me));
        /* Set the end address for the highest-numbered CPU to the
         * end of the segment, absorbing any rounding error. */
        /* Rounding may leave a little RAM untested, but that is
         * better than crashing or reporting false errors. */
        /* In practice this rounding should rarely matter, since the
         * segments are expected to be multiples of 4096-byte pages. */
if (me == mstr_cpu) {
*end = (ulong*)(vv->map[j].end);
} else {
*end = (ulong*)((ulong)(*start) + chunk);
(*end)--;
}
}
}
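/* Worked example (hypothetical numbers): for a 1 GB segment and
 * run_cpus == 4, chunk = 0x40000000 / 4 = 0x10000000 bytes, already a
 * multiple of 64. CPU 1 then tests [start + 0x10000000,
 * start + 0x1ffffffc], and the master CPU's *end is clamped to the
 * segment's end. */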
/* Calls segment_fn() for each window of at most SPINSZ_DWORDS dwords
 * between 'start' and 'end'.
 */
void foreach_segment
(ulong* start, ulong* end,
int me, const void* ctx, segment_fn func) {
ASSERT(start < end);
    // 'start' must point to an even-numbered dword (8-byte aligned),
    // and 'end' must point to an odd-numbered dword.
ASSERT(0 == (((ulong)start) & 0x7));
ASSERT(0x4 == (((ulong)end) & 0x7));
// 'end' may be exactly 0xfffffffc, right at the 4GB boundary.
//
    // To avoid overflow in the loop tests and length calculations,
    // work in dword indices (the '_dw' vars) rather than byte addresses.
ulong start_dw = ((ulong)start) >> 2;
ulong end_dw = ((ulong) end) >> 2;
    // 'end' always looks like 0x...ffc; increment end_dw so it points
    // one dword past the segment, which simplifies the boundary math.
++end_dw;
ulong seg_dw = start_dw;
ulong seg_end_dw = start_dw;
int done = 0;
do {
do_tick(me);
{ BAILR }
// ensure no overflow
ASSERT((seg_end_dw + SPINSZ_DWORDS) > seg_end_dw);
seg_end_dw += SPINSZ_DWORDS;
if (seg_end_dw >= end_dw) {
seg_end_dw = end_dw;
done++;
}
if (seg_dw == seg_end_dw) {
break;
}
ASSERT(((ulong)seg_end_dw) <= 0x40000000);
ASSERT(seg_end_dw > seg_dw);
ulong seg_len_dw = seg_end_dw - seg_dw;
func((ulong*)(seg_dw << 2), seg_len_dw, ctx);
seg_dw = seg_end_dw;
} while (!done);
}
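/* Example (assuming, say, SPINSZ_DWORDS == 0x1000000): a segment of
 * 0x2800000 dwords produces three func() calls with lengths 0x1000000,
 * 0x1000000 and 0x800000 dwords. */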
/* Calls segment_fn() for each segment in vv->map.
*
 * Does not slice by CPU number, so each call covers all of memory.
 * Contrast with sliced_foreach_segment().
*/
STATIC void unsliced_foreach_segment
(const void* ctx, int me, segment_fn func) {
int j;
for (j=0; j<segs; j++) {
foreach_segment(vv->map[j].start,
vv->map[j].end,
me, ctx, func);
}
}
/* Calls segment_fn() for each segment to be tested by CPU 'me'.
*
* In multicore mode, slices the segments by 'me' (the CPU ordinal
* number) so that each call will cover only 1/Nth of memory.
*/
STATIC void sliced_foreach_segment
(const void *ctx, int me, segment_fn func) {
int j;
ulong *start, *end; // VAs
ulong* prev_end = 0;
for (j=0; j<segs; j++) {
calculate_chunk(&start, &end, me, j, 64);
// Ensure no overlap among chunks
ASSERT(end > start);
        if (prev_end != 0) {
ASSERT(prev_end < start);
}
prev_end = end;
foreach_segment(start, end, me, ctx, func);
}
}
STATIC void addr_tst1_seg(ulong* restrict buf,
ulong len_dw, const void* unused) {
// Within each segment:
// - choose a low dword offset 'off'
// - write pat to *off
// - write ~pat to addresses that are above off by
// 1, 2, 4, ... dwords up to the top of the segment. None
// should alias to the original dword.
// - write ~pat to addresses that are below off by
    //   1, 2, 4, etc. dwords, down to the start of the segment. None
// should alias to the original dword. If adding a given offset
// doesn't produce a single bit address flip (because it produced
// a carry) subtracting the same offset should give a single bit flip.
// - repeat this, moving off ahead in increments of 1MB;
// this covers address bits within physical memory banks, we hope?
ulong pat;
int k;
for (pat=0x5555aaaa, k=0; k<2; k++) {
hprint(LINE_PAT, COL_PAT, pat);
for (ulong off_dw = 0; off_dw < len_dw; off_dw += (1 << 18)) {
buf[off_dw] = pat;
pat = ~pat;
for (ulong more_off_dw = 1; off_dw + more_off_dw < len_dw;
more_off_dw = more_off_dw << 1) {
ASSERT(more_off_dw); // it should never get to zero
buf[off_dw + more_off_dw] = pat;
ulong bad;
if ((bad = buf[off_dw]) != ~pat) {
ad_err1(buf + off_dw,
buf + off_dw + more_off_dw,
bad, ~pat);
break;
}
}
for (ulong more_off_dw = 1; off_dw > more_off_dw;
more_off_dw = more_off_dw << 1) {
ASSERT(more_off_dw); // it should never get to zero
buf[off_dw - more_off_dw] = pat;
ulong bad;
if ((bad = buf[off_dw]) != ~pat) {
ad_err1(buf + off_dw,
buf + off_dw - more_off_dw,
bad, ~pat);
break;
}
}
}
}
}
/*
* Memory address test, walking ones
*/
void addr_tst1(int me)
{
unsliced_foreach_segment(nullptr, me, addr_tst1_seg);
}
STATIC void addr_tst2_init_segment(ulong* p,
ulong len_dw, const void* unused) {
ulong* pe = p + (len_dw - 1);
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
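#if PREFER_C
    // Plausible C equivalent of the assembly below, following the file's
    // PREFER_C convention; a sketch, not verified against the tuned asm.
    for (; p <= pe; p++) {
        *p = (ulong)p;
    }
#else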
asm __volatile__ (
"jmp L91\n\t"
".p2align 4,,7\n\t"
"L90:\n\t"
"addl $4,%%edi\n\t"
"L91:\n\t"
"movl %%edi,(%%edi)\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L90\n\t"
: : "D" (p), "d" (pe)
);
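#endif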
}
STATIC void addr_tst2_check_segment(ulong* p,
ulong len_dw, const void* unused) {
ulong* pe = p + (len_dw - 1);
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
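#if PREFER_C
    // Plausible C equivalent of the assembly below; a sketch, not
    // verified against the tuned asm.
    ulong bad;
    for (; p <= pe; p++) {
        if ((bad = *p) != (ulong)p) {
            ad_err2((ulong)p, bad);
        }
    }
#else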
asm __volatile__
(
"jmp L95\n\t"
".p2align 4,,7\n\t"
"L99:\n\t"
"addl $4,%%edi\n\t"
"L95:\n\t"
"movl (%%edi),%%ecx\n\t"
"cmpl %%edi,%%ecx\n\t"
"jne L97\n\t"
"L96:\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L99\n\t"
"jmp L98\n\t"
"L97:\n\t"
"pushl %%edx\n\t"
"pushl %%ecx\n\t"
"pushl %%edi\n\t"
"call ad_err2\n\t"
"popl %%edi\n\t"
"popl %%ecx\n\t"
"popl %%edx\n\t"
"jmp L96\n\t"
"L98:\n\t"
: : "D" (p), "d" (pe)
: "ecx"
);
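#endif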
}
/*
* Memory address test, own address
*/
void addr_tst2(int me)
{
cprint(LINE_PAT, COL_PAT, "address ");
/* Write each address with its own address */
unsliced_foreach_segment(nullptr, me, addr_tst2_init_segment);
{ BAILR }
/* Each address should have its own address */
unsliced_foreach_segment(nullptr, me, addr_tst2_check_segment);
}
typedef struct {
int me;
ulong xorVal;
} movinvr_ctx;
STATIC void movinvr_init(ulong* p,
ulong len_dw, const void* vctx) {
ulong* pe = p + (len_dw - 1);
const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
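#if PREFER_C
    // Plausible C equivalent of the assembly below; a sketch, not
    // verified against the tuned asm.
    for (; p <= pe; p++) {
        *p = rand(ctx->me);
    }
#else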
asm __volatile__
(
"jmp L200\n\t"
".p2align 4,,7\n\t"
"L201:\n\t"
"addl $4,%%edi\n\t"
"L200:\n\t"
"pushl %%ecx\n\t"
"call rand\n\t"
"popl %%ecx\n\t"
"movl %%eax,(%%edi)\n\t"
"cmpl %%ebx,%%edi\n\t"
"jb L201\n\t"
: : "D" (p), "b" (pe), "c" (ctx->me)
: "eax"
);
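#endif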
}
STATIC void movinvr_body(ulong* p, ulong len_dw, const void* vctx) {
ulong* pe = p + (len_dw - 1);
const movinvr_ctx* ctx = (const movinvr_ctx*)vctx;
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
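#if PREFER_C
    // Plausible C equivalent of the assembly below; a sketch, not
    // verified against the tuned asm. ctx->xorVal (0 or 0xffffffff)
    // replaces the 'if (i) num = ~num;' of the original sketch.
    ulong bad;
    for (; p <= pe; p++) {
        ulong num = rand(ctx->me) ^ ctx->xorVal;
        if ((bad = *p) != num) {
            mt86_error(p, num, bad);
        }
        *p = ~num;
    }
#else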
asm __volatile__
(
"pushl %%ebp\n\t"
// Skip first increment
"jmp L26\n\t"
".p2align 4,,7\n\t"
// increment 4 bytes (32-bits)
"L27:\n\t"
"addl $4,%%edi\n\t"
// Check this byte
"L26:\n\t"
// Get next random number, pass in me(edx), random value returned in num(eax)
// num = rand(me);
// cdecl call maintains all registers except eax, ecx, and edx
// We maintain edx with a push and pop here using it also as an input
// we don't need the current eax value and want it to change to the return value
// we overwrite ecx shortly after this discarding its current value
"pushl %%edx\n\t" // Push function inputs onto stack
"call rand\n\t"
"popl %%edx\n\t" // Remove function inputs from stack
// XOR the random number with xorVal(ebx), which is either 0xffffffff or 0 depending on the outer loop
// if (i) { num = ~num; }
"xorl %%ebx,%%eax\n\t"
// Move the current value of the current position p(edi) into bad(ecx)
// (bad=*p)
"movl (%%edi),%%ecx\n\t"
// Compare bad(ecx) to num(eax)
"cmpl %%eax,%%ecx\n\t"
// If not equal jump the error case
"jne L23\n\t"
      // Store the complement, ~num, at the current position p(edi)
      // *p = ~num;
"L25:\n\t"
"movl $0xffffffff,%%ebp\n\t"
"xorl %%ebp,%%eax\n\t"
"movl %%eax,(%%edi)\n\t"
// Loop until current position p(edi) equals the end position pe(esi)
"cmpl %%esi,%%edi\n\t"
"jb L27\n\t"
"jmp L24\n"
// Error case
"L23:\n\t"
// Must manually maintain eax, ecx, and edx as part of cdecl call convention
"pushl %%edx\n\t"
"pushl %%ecx\n\t" // Next three pushes are functions input
"pushl %%eax\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t" // Remove function inputs from stack and restore register values
"popl %%eax\n\t"
"popl %%ecx\n\t"
"popl %%edx\n\t"
"jmp L25\n"
"L24:\n\t"
"popl %%ebp\n\t"
:: "D" (p), "S" (pe), "b" (ctx->xorVal),
"d" (ctx->me)
: "eax", "ecx"
);
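#endif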
}
/*
 * Test all of memory using a "half moving inversions" algorithm that uses
 * random numbers and their complements as the data pattern. Since we cannot
 * produce the random sequence in reverse order, testing is only done in the
 * forward direction.
 */
void movinvr(int me)
{
int i, seed1, seed2;
movinvr_ctx ctx;
ctx.me = me;
ctx.xorVal = 0;
/* Initialize memory with initial sequence of random numbers. */
if (cpu_id.fid.bits.rdtsc) {
asm __volatile__ ("rdtsc":"=a" (seed1),"=d" (seed2));
} else {
seed1 = 521288629 + vv->pass;
seed2 = 362436069 - vv->pass;
}
/* Display the current seed */
if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, seed1);
rand_seed(seed1, seed2, me);
sliced_foreach_segment(&ctx, me, movinvr_init);
{ BAILR }
/* Do moving inversions test. Check for initial pattern and then
* write the complement for each memory location.
*/
for (i=0; i<2; i++) {
rand_seed(seed1, seed2, me);
if (i) {
ctx.xorVal = 0xffffffff;
} else {
ctx.xorVal = 0;
}
sliced_foreach_segment(&ctx, me, movinvr_body);
{ BAILR }
}
}
typedef struct {
ulong p1;
ulong p2;
} movinv1_ctx;
STATIC void movinv1_init(ulong* start,
ulong len_dw, const void* vctx) {
const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
ulong p1 = ctx->p1;
ulong* p = start;
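#if PREFER_C
    // Plausible C equivalent of the 'rep stosl' fill below; a sketch
    // following the file's PREFER_C convention.
    for (ulong i = 0; i < len_dw; i++) {
        p[i] = p1;
    }
#else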
asm __volatile__
(
"rep\n\t"
"stosl\n\t"
: : "c" (len_dw), "D" (p), "a" (p1)
);
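#endif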
}
STATIC void movinv1_bottom_up(ulong* start,
ulong len_dw, const void* vctx) {
const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
ulong p1 = ctx->p1;
ulong p2 = ctx->p2;
ulong* p = start;
ulong* pe = p + (len_dw - 1);
    // Original C code replaced with hand-tuned assembly; an equivalent
    // C version follows under PREFER_C. (The old reference code was
    // flagged "seems broken"; treat the C path as unverified.)
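#if PREFER_C
    // Unverified C sketch of the assembly below.
    ulong bad;
    for (; p <= pe; p++) {
        if ((bad = *p) != p1) {
            mt86_error(p, p1, bad);
        }
        *p = p2;
    }
#else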
asm __volatile__
(
"jmp L2\n\t"
".p2align 4,,7\n\t"
"L0:\n\t"
"addl $4,%%edi\n\t"
"L2:\n\t"
"movl (%%edi),%%ecx\n\t"
"cmpl %%eax,%%ecx\n\t"
"jne L3\n\t"
"L5:\n\t"
"movl %%ebx,(%%edi)\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L0\n\t"
"jmp L4\n"
"L3:\n\t"
"pushl %%edx\n\t"
"pushl %%ebx\n\t"
"pushl %%ecx\n\t"
"pushl %%eax\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"popl %%eax\n\t"
"popl %%ecx\n\t"
"popl %%ebx\n\t"
"popl %%edx\n\t"
"jmp L5\n"
"L4:\n\t"
:: "a" (p1), "D" (p), "d" (pe), "b" (p2)
: "ecx"
);
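#endif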
}
STATIC void movinv1_top_down(ulong* start,
ulong len_dw, const void* vctx) {
const movinv1_ctx* ctx = (const movinv1_ctx*)vctx;
ulong p1 = ctx->p1;
ulong p2 = ctx->p2;
ulong* p = start + (len_dw - 1);
ulong* pe = start;
    // Original C code replaced with hand-tuned assembly; an equivalent
    // C version follows under PREFER_C. (The old reference code was
    // flagged "seems broken"; treat the C path as unverified.)
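#if PREFER_C
    // Unverified C sketch of the assembly below.
    ulong bad;
    do {
        if ((bad = *p) != p2) {
            mt86_error(p, p2, bad);
        }
        *p = p1;
    } while (--p >= pe);
#else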
asm __volatile__
(
"jmp L9\n\t"
".p2align 4,,7\n\t"
"L11:\n\t"
"subl $4, %%edi\n\t"
"L9:\n\t"
"movl (%%edi),%%ecx\n\t"
"cmpl %%ebx,%%ecx\n\t"
"jne L6\n\t"
"L10:\n\t"
"movl %%eax,(%%edi)\n\t"
"cmpl %%edi, %%edx\n\t"
"jne L11\n\t"
"jmp L7\n\t"
"L6:\n\t"
"pushl %%edx\n\t"
"pushl %%eax\n\t"
"pushl %%ecx\n\t"
"pushl %%ebx\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"popl %%ebx\n\t"
"popl %%ecx\n\t"
"popl %%eax\n\t"
"popl %%edx\n\t"
"jmp L10\n"
"L7:\n\t"
:: "a" (p1), "D" (p), "d" (pe), "b" (p2)
: "ecx"
);
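#endif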
}
/*
* Test all of memory using a "moving inversions" algorithm using the
* pattern in p1 and its complement in p2.
*/
void movinv1 (int iter, ulong p1, ulong p2, int me)
{
int i;
/* Display the current pattern */
if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);
movinv1_ctx ctx;
ctx.p1 = p1;
ctx.p2 = p2;
sliced_foreach_segment(&ctx, me, movinv1_init);
{ BAILR }
/* Do moving inversions test. Check for initial pattern and then
* write the complement for each memory location. Test from bottom
* up and then from the top down. */
for (i=0; i<iter; i++) {
sliced_foreach_segment(&ctx, me, movinv1_bottom_up);
{ BAILR }
// NOTE(jcoiner):
// For the top-down pass, the original 5.01 code iterated over
// 'segs' in from n-1 down to 0, and then within each mapped segment,
// it would form the SPINSZ windows from the top down -- thus forming
// a different set of windows than the bottom-up pass, when the segment
// is not an integer number of windows.
//
// My guess is that this buys us very little additional coverage, that
// the value in going top-down happens at the word or cache-line level
// and that there's little to be gained from reversing the direction of
// the outer loops. So I'm leaving a 'direction' bit off of the
// foreach_segment() routines for now.
sliced_foreach_segment(&ctx, me, movinv1_top_down);
{ BAILR }
}
}
typedef struct {
ulong p1;
ulong lb;
ulong hb;
int sval;
int off;
} movinv32_ctx;
STATIC void movinv32_init(ulong* restrict buf,
ulong len_dw, const void* vctx) {
const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
ulong* p = buf;
ulong* pe = buf + (len_dw - 1);
int k = ctx->off;
ulong pat = ctx->p1;
ulong lb = ctx->lb;
int sval = ctx->sval;
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
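#if PREFER_C
    // Plausible C equivalent of the assembly below; a sketch, not
    // verified against the tuned asm.
    while (p <= pe) {
        *p = pat;
        if (++k >= 32) {
            pat = lb;
            k = 0;
        } else {
            pat = pat << 1;
            pat |= sval;
        }
        p++;
    }
#else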
asm __volatile__
(
"jmp L20\n\t"
".p2align 4,,7\n\t"
"L923:\n\t"
"addl $4,%%edi\n\t"
"L20:\n\t"
"movl %%ecx,(%%edi)\n\t"
"addl $1,%%ebx\n\t"
"cmpl $32,%%ebx\n\t"
"jne L21\n\t"
"movl %%esi,%%ecx\n\t"
"xorl %%ebx,%%ebx\n\t"
"jmp L22\n"
"L21:\n\t"
"shll $1,%%ecx\n\t"
"orl %%eax,%%ecx\n\t"
"L22:\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L923\n\t"
:: "D" (p),"d" (pe),"b" (k),"c" (pat),
"a" (sval), "S" (lb)
);
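#endif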
}
STATIC void movinv32_bottom_up(ulong* restrict buf, ulong len_dw,
const void* vctx) {
const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
ulong* p = buf;
ulong* pe = buf + (len_dw - 1);
int k = ctx->off;
ulong pat = ctx->p1;
ulong lb = ctx->lb;
int sval = ctx->sval;
    /* Original C code replaced with hand-tuned assembly; an equivalent
     * C version follows under PREFER_C. */
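#if PREFER_C
    // Plausible C equivalent of the assembly below; a sketch, not
    // verified against the tuned asm.
    ulong bad;
    while (1) {
        if ((bad = *p) != pat) {
            mt86_error(p, pat, bad);
        }
        *p = ~pat;
        if (p >= pe) break;
        p++;
        if (++k >= 32) {
            pat = lb;
            k = 0;
        } else {
            pat = pat << 1;
            pat |= sval;
        }
    }
#else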
asm __volatile__
(
"pushl %%ebp\n\t"
"jmp L30\n\t"
".p2align 4,,7\n\t"
"L930:\n\t"
"addl $4,%%edi\n\t"
"L30:\n\t"
"movl (%%edi),%%ebp\n\t"
"cmpl %%ecx,%%ebp\n\t"
"jne L34\n\t"
"L35:\n\t"
"notl %%ecx\n\t"
"movl %%ecx,(%%edi)\n\t"
"notl %%ecx\n\t"
"incl %%ebx\n\t"
"cmpl $32,%%ebx\n\t"
"jne L31\n\t"
"movl %%esi,%%ecx\n\t"
"xorl %%ebx,%%ebx\n\t"
"jmp L32\n"
"L31:\n\t"
"shll $1,%%ecx\n\t"
"orl %%eax,%%ecx\n\t"
"L32:\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L930\n\t"
"jmp L33\n\t"
"L34:\n\t"
"pushl %%esi\n\t"
"pushl %%eax\n\t"
"pushl %%ebx\n\t"
"pushl %%edx\n\t"
"pushl %%ebp\n\t"
"pushl %%ecx\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"popl %%ecx\n\t"
"popl %%ebp\n\t"
"popl %%edx\n\t"
"popl %%ebx\n\t"
"popl %%eax\n\t"
"popl %%esi\n\t"
"jmp L35\n"
"L33:\n\t"
"popl %%ebp\n\t"
: "=b" (k),"=c" (pat)
: "D" (p),"d" (pe),"b" (k),"c" (pat),
"a" (sval), "S" (lb)
);
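#endif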
}
STATIC void movinv32_top_down(ulong* restrict buf,
ulong len_dw, const void* vctx) {
const movinv32_ctx* restrict ctx = (const movinv32_ctx*)vctx;
ulong* pe = buf;
ulong* p = buf + (len_dw - 1);
int k = ctx->off;
ulong pat = ctx->p1;
ulong hb = ctx->hb;
int sval = ctx->sval;
ulong p3 = (ulong)sval << 31;
// Advance 'k' and 'pat' to where they would have been
// at the end of the corresponding bottom_up segment.
//
// The '-1' is because we didn't advance 'k' or 'pat'
// on the final bottom_up loop, so they're off by one...
ulong mod_len = (len_dw - 1) % 32;
for (int i = 0; i < mod_len; i++) {
if (++k >= 32) {
pat = ctx->lb;
k = 0;
} else {
pat = pat << 1;
pat |= sval;
}
}
// Increment 'k' only because the code below has an off-by-one
// interpretation of 'k' relative to the bottom_up routine.
// There it ranges from 0:31, and here it ranges from 1:32.
k++;
    /* Original C code replaced with hand-tuned assembly code */
#if PREFER_C
ulong bad;
while(1) {
if ((bad=*p) != ~pat) {
mt86_error((ulong*)p, ~pat, bad);
}
*p = pat;
if (p <= pe) break;
p--;
if (--k <= 0) {
k = 32;
pat = hb;
} else {
pat = pat >> 1;
pat |= p3;
}
};
#else
asm __volatile__
(
"pushl %%ebp\n\t"
"jmp L40\n\t"
".p2align 4,,7\n\t"
"L49:\n\t"
"subl $4,%%edi\n\t"
"L40:\n\t"
"movl (%%edi),%%ebp\n\t"
"notl %%ecx\n\t"
"cmpl %%ecx,%%ebp\n\t"
"jne L44\n\t"
"L45:\n\t"
"notl %%ecx\n\t"
"movl %%ecx,(%%edi)\n\t"
"decl %%ebx\n\t"
"cmpl $0,%%ebx\n\t"
"jg L41\n\t"
"movl %%esi,%%ecx\n\t"
"movl $32,%%ebx\n\t"
"jmp L42\n"
"L41:\n\t"
"shrl $1,%%ecx\n\t"
"orl %%eax,%%ecx\n\t"
"L42:\n\t"
"cmpl %%edx,%%edi\n\t"
"ja L49\n\t"
"jmp L43\n\t"
"L44:\n\t"
"pushl %%esi\n\t"
"pushl %%eax\n\t"
"pushl %%ebx\n\t"
"pushl %%edx\n\t"
"pushl %%ebp\n\t"
"pushl %%ecx\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"popl %%ecx\n\t"
"popl %%ebp\n\t"
"popl %%edx\n\t"
"popl %%ebx\n\t"
"popl %%eax\n\t"
"popl %%esi\n\t"
"jmp L45\n"
"L43:\n\t"
"popl %%ebp\n\t"
: : "D" (p),"d" (pe),"b" (k),"c" (pat),
"a" (p3), "S" (hb)
);
#endif
}
void movinv32(int iter, ulong p1, ulong lb, ulong hb, int sval, int off,int me)
{
// First callsite:
// - p1 has 1 bit set (somewhere)
// - lb = 1 ("low bit")
// - hb = 0x80000000 ("high bit")
// - sval = 0
// - 'off' indicates the position of the set bit in p1
//
// Second callsite is the same, but inverted:
// - p1 has 1 bit clear (somewhere)
// - lb = 0xfffffffe
// - hb = 0x7fffffff
// - sval = 1
// - 'off' indicates the position of the cleared bit in p1
movinv32_ctx ctx;
ctx.p1 = p1;
ctx.lb = lb;
ctx.hb = hb;
ctx.sval = sval;
ctx.off = off;
/* Display the current pattern */
if (mstr_cpu == me) hprint(LINE_PAT, COL_PAT, p1);
sliced_foreach_segment(&ctx, me, movinv32_init);
{ BAILR }
/* Do moving inversions test. Check for initial pattern and then
* write the complement for each memory location. Test from bottom
* up and then from the top down. */
for (int i=0; i<iter; i++) {
sliced_foreach_segment(&ctx, me, movinv32_bottom_up);
{ BAILR }
sliced_foreach_segment(&ctx, me, movinv32_top_down);
{ BAILR }
}
}
typedef struct {
int offset;
ulong p1;
ulong p2;
} modtst_ctx;
STATIC void modtst_sparse_writes(ulong* restrict start,
ulong len_dw, const void* vctx) {
const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
ulong p1 = ctx->p1;
ulong offset = ctx->offset;
#if PREFER_C
for (ulong i = offset; i < len_dw; i += MOD_SZ) {
start[i] = p1;
}
#else
ulong* p = start + offset;
ulong* pe = start + len_dw;
asm __volatile__
(
"jmp L60\n\t"
".p2align 4,,7\n\t"
"L60:\n\t"
"movl %%eax,(%%edi)\n\t"
"addl $80,%%edi\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L60\n\t"
:: "D" (p), "d" (pe), "a" (p1)
);
#endif
}
STATIC void modtst_dense_writes(ulong* restrict start, ulong len_dw,
const void* vctx) {
const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
ulong p2 = ctx->p2;
ulong offset = ctx->offset;
ASSERT(offset < MOD_SZ);
ulong k = 0;
#if PREFER_C
for (ulong i = 0; i < len_dw; i++) {
if (k != offset) {
start[i] = p2;
}
if (++k >= MOD_SZ) {
k = 0;
}
}
#else
ulong* pe = start + (len_dw - 1);
asm __volatile__
(
"jmp L50\n\t"
".p2align 4,,7\n\t"
"L54:\n\t"
"addl $4,%%edi\n\t"
"L50:\n\t"
"cmpl %%ebx,%%ecx\n\t"
"je L52\n\t"
"movl %%eax,(%%edi)\n\t"
"L52:\n\t"
"incl %%ebx\n\t"
"cmpl $19,%%ebx\n\t"
"jle L53\n\t"
"xorl %%ebx,%%ebx\n\t"
"L53:\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L54\n\t"
: : "D" (start), "d" (pe), "a" (p2),
"b" (k), "c" (offset)
);
#endif
}
STATIC void modtst_check(ulong* restrict start,
ulong len_dw, const void* vctx) {
const modtst_ctx* restrict ctx = (const modtst_ctx*)vctx;
ulong p1 = ctx->p1;
ulong offset = ctx->offset;
ASSERT(offset < MOD_SZ);
#if PREFER_C
ulong bad;
for (ulong i = offset; i < len_dw; i += MOD_SZ) {
if ((bad = start[i]) != p1)
mt86_error(start + i, p1, bad);
}
#else
ulong* p = start + offset;
ulong* pe = start + len_dw;
asm __volatile__
(
"jmp L70\n\t"
".p2align 4,,7\n\t"
"L70:\n\t"
"movl (%%edi),%%ecx\n\t"
"cmpl %%eax,%%ecx\n\t"
"jne L71\n\t"
"L72:\n\t"
"addl $80,%%edi\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L70\n\t"
"jmp L73\n\t"
"L71:\n\t"
"pushl %%edx\n\t"
"pushl %%ecx\n\t"
"pushl %%eax\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"popl %%eax\n\t"
"popl %%ecx\n\t"
"popl %%edx\n\t"
"jmp L72\n"
"L73:\n\t"
: : "D" (p), "d" (pe), "a" (p1)
: "ecx"
);
#endif
}
/*
* Test all of memory using modulo X access pattern.
*/
void modtst(int offset, int iter, ulong p1, ulong p2, int me)
{
modtst_ctx ctx;
ctx.offset = offset;
ctx.p1 = p1;
ctx.p2 = p2;
/* Display the current pattern */
if (mstr_cpu == me) {
hprint(LINE_PAT, COL_PAT-2, p1);
cprint(LINE_PAT, COL_PAT+6, "-");
dprint(LINE_PAT, COL_PAT+7, offset, 2, 1);
}
/* Write every nth location with pattern */
sliced_foreach_segment(&ctx, me, modtst_sparse_writes);
{ BAILR }
/* Write the rest of memory "iter" times with the pattern complement */
    for (int i=0; i<iter; i++) {
sliced_foreach_segment(&ctx, me, modtst_dense_writes);
{ BAILR }
}
/* Now check every nth location */
sliced_foreach_segment(&ctx, me, modtst_check);
}
#if PREFER_C
STATIC void movsl(ulong* dest,
ulong* src,
ulong size_in_dwords) {
/* Logically equivalent to:
for (ulong i = 0; i < size_in_dwords; i++)
dest[i] = src[i];
However: the movsl instruction does the entire loop
in one instruction -- this is probably how 'memcpy'
is implemented -- so hardware makes it very fast.
Even in PREFER_C mode, we want the brute force of movsl!
*/
asm __volatile__
(
"cld\n"
"jmp L1189\n\t"
".p2align 4,,7\n\t"
"L1189:\n\t"
"movl %1,%%edi\n\t" // dest
"movl %0,%%esi\n\t" // src
"movl %2,%%ecx\n\t" // len in dwords
"rep\n\t"
"movsl\n\t"
:: "g" (src), "g" (dest), "g" (size_in_dwords)
: "edi", "esi", "ecx"
);
}
#endif // PREFER_C
STATIC ulong block_move_normalize_len_dw(ulong len_dw) {
// The block_move test works with sets of 64-byte blocks,
// so ensure our total length is a multiple of 64.
//
// In fact, since we divide the region in half, and each half-region
// is a set of 64-byte blocks, the full region should be a multiple of 128
// bytes.
//
// Note that there's no requirement for the start address of the region to
// be 64-byte aligned, it can be any dword.
    ulong result = (len_dw >> 5) << 5; // round down to a multiple of 32 dwords (128 bytes)
ASSERT(result > 0);
return result;
}
STATIC void block_move_init(ulong* restrict buf,
ulong len_dw, const void* unused_ctx) {
len_dw = block_move_normalize_len_dw(len_dw);
// Compute 'len' in units of 64-byte chunks:
ulong len = len_dw >> 4;
// We only need to initialize len/2, since we'll just copy
// the first half onto the second half in the move step.
len = len >> 1;
ulong base_val = 1;
#if PREFER_C
while(len > 0) {
ulong neg_val = ~base_val;
// Set a block of 64 bytes // first block DWORDS are:
buf[0] = base_val; // 0x00000001
buf[1] = base_val; // 0x00000001
buf[2] = base_val; // 0x00000001
buf[3] = base_val; // 0x00000001
buf[4] = neg_val; // 0xfffffffe
buf[5] = neg_val; // 0xfffffffe
buf[6] = base_val; // 0x00000001
buf[7] = base_val; // 0x00000001
buf[8] = base_val; // 0x00000001
buf[9] = base_val; // 0x00000001
buf[10] = neg_val; // 0xfffffffe
buf[11] = neg_val; // 0xfffffffe
buf[12] = base_val; // 0x00000001
buf[13] = base_val; // 0x00000001
buf[14] = neg_val; // 0xfffffffe
buf[15] = neg_val; // 0xfffffffe
buf += 16; // advance to next 64-byte block
len--;
// Rotate the bit left, including an all-zero state.
// It can't hurt to have a periodicity of 33 instead of
// a power of two.
if (base_val == 0) {
base_val = 1;
} else if (base_val & 0x80000000) {
base_val = 0;
} else {
base_val = base_val << 1;
}
}
#else
asm __volatile__
(
"jmp L100\n\t"
".p2align 4,,7\n\t"
"L100:\n\t"
// First loop eax is 0x00000001, edx is 0xfffffffe
"movl %%eax, %%edx\n\t"
"notl %%edx\n\t"
// Set a block of 64-bytes // First loop DWORDS are
"movl %%eax,0(%%edi)\n\t" // 0x00000001
"movl %%eax,4(%%edi)\n\t" // 0x00000001
"movl %%eax,8(%%edi)\n\t" // 0x00000001
"movl %%eax,12(%%edi)\n\t" // 0x00000001
"movl %%edx,16(%%edi)\n\t" // 0xfffffffe
"movl %%edx,20(%%edi)\n\t" // 0xfffffffe
"movl %%eax,24(%%edi)\n\t" // 0x00000001
"movl %%eax,28(%%edi)\n\t" // 0x00000001
"movl %%eax,32(%%edi)\n\t" // 0x00000001
"movl %%eax,36(%%edi)\n\t" // 0x00000001
"movl %%edx,40(%%edi)\n\t" // 0xfffffffe
"movl %%edx,44(%%edi)\n\t" // 0xfffffffe
"movl %%eax,48(%%edi)\n\t" // 0x00000001
"movl %%eax,52(%%edi)\n\t" // 0x00000001
"movl %%edx,56(%%edi)\n\t" // 0xfffffffe
"movl %%edx,60(%%edi)\n\t" // 0xfffffffe
// rotate left with carry,
// second loop eax is 0x00000002
// second loop edx is (~eax) 0xfffffffd
"rcll $1, %%eax\n\t"
// Move current position forward 64-bytes (to start of next block)
"leal 64(%%edi), %%edi\n\t"
// Loop until end
"decl %%ecx\n\t"
"jnz L100\n\t"
: : "D" (buf), "c" (len), "a" (base_val)
: "edx"
);
#endif
}
typedef struct {
int iter;
int me;
} block_move_ctx;
STATIC void block_move_move(ulong* restrict buf,
ulong len_dw, const void* vctx) {
const block_move_ctx* restrict ctx = (const block_move_ctx*)vctx;
ulong iter = ctx->iter;
int me = ctx->me;
len_dw = block_move_normalize_len_dw(len_dw);
/* Now move the data around
* First move the data up half of the segment size we are testing
* Then move the data to the original location + 32 bytes
*/
ulong half_len_dw = len_dw / 2; // Half the size of this block in DWORDS
ASSERT(half_len_dw > 8);
ulong* mid = buf + half_len_dw; // VA at mid-point of this block.
for (int i=0; i<iter; i++) {
if (i > 0) {
// foreach_segment() called this before the 0th iteration,
// so don't tick twice in quick succession.
do_tick(me);
}
{ BAILR }
#if PREFER_C
// Move first half to 2nd half:
movsl(/*dest=*/ mid, /*src=*/ buf, half_len_dw);
// Move the second half, less the last 8 dwords
// to the first half plus an offset of 8 dwords.
movsl(/*dest=*/ buf + 8, /*src=*/ mid, half_len_dw - 8);
// Finally, move the last 8 dwords of the 2nd half
// to the first 8 dwords of the first half.
        movsl(/*dest=*/ buf, /*src=*/ mid + half_len_dw - 8, 8);
#else
asm __volatile__
(
"cld\n"
"jmp L110\n\t"
".p2align 4,,7\n\t"
"L110:\n\t"
//
// At the end of all this
             //  - the second half equals the initial value of the first half
// - the first half is right shifted 32-bytes (with wrapping)
//
// Move first half to second half
"movl %1,%%edi\n\t" // Destination 'mid' (mid point)
"movl %0,%%esi\n\t" // Source, 'buf' (start point)
"movl %2,%%ecx\n\t" // Length, 'half_len_dw' (size of a half in DWORDS)
"rep\n\t"
"movsl\n\t"
             // Move the second half, less the last 32 bytes, to the first half at an offset of 32 bytes
"movl %0,%%edi\n\t"
"addl $32,%%edi\n\t" // Destination 'buf' plus 32 bytes
"movl %1,%%esi\n\t" // Source, 'mid'
"movl %2,%%ecx\n\t"
"subl $8,%%ecx\n\t" // Length, 'half_len_dw'
"rep\n\t"
"movsl\n\t"
// Move last 8 DWORDS (32-bytes) of the second half to the start of the first half
"movl %0,%%edi\n\t" // Destination 'buf'
// Source, 8 DWORDS from the end of the second half, left over by the last rep/movsl
"movl $8,%%ecx\n\t" // Length, 8 DWORDS (32-bytes)
"rep\n\t"
"movsl\n\t"
:: "g" (buf), "g" (mid), "g" (half_len_dw)
: "edi", "esi", "ecx"
);
#endif
}
}
STATIC void block_move_check(ulong* restrict buf,
ulong len_dw, const void* unused_ctx) {
len_dw = block_move_normalize_len_dw(len_dw);
/* Now check the data.
* This is rather crude, we just check that the
* adjacent words are the same.
*/
#if PREFER_C
for (ulong i = 0; i < len_dw; i = i + 2) {
if (buf[i] != buf[i+1]) {
mt86_error(buf+i, buf[i], buf[i+1]);
}
}
#else
ulong* pe = buf + (len_dw - 2);
asm __volatile__
(
"jmp L120\n\t"
".p2align 4,,7\n\t"
"L124:\n\t"
"addl $8,%%edi\n\t" // Next QWORD
"L120:\n\t"
// Compare adjacent DWORDS
"movl (%%edi),%%ecx\n\t"
"cmpl 4(%%edi),%%ecx\n\t"
"jnz L121\n\t" // Print error if they don't match
// Loop until end of block
"L122:\n\t"
"cmpl %%edx,%%edi\n\t"
"jb L124\n"
"jmp L123\n\t"
"L121:\n\t"
     // eax is not used, so we don't need to save it per cdecl
     // ecx is used but not restored; we don't need its value after this point
"pushl %%edx\n\t"
"pushl 4(%%edi)\n\t"
"pushl %%ecx\n\t"
"pushl %%edi\n\t"
"call mt86_error\n\t"
"popl %%edi\n\t"
"addl $8,%%esp\n\t"
"popl %%edx\n\t"
"jmp L122\n"
"L123:\n\t"
:: "D" (buf), "d" (pe)
: "ecx"
);
#endif
}
/*
* Test memory using block moves
* Adapted from Robert Redelmeier's burnBX test
*/
void block_move(int iter, int me)
{
cprint(LINE_PAT, COL_PAT-2, " ");
block_move_ctx ctx;
ctx.iter = iter;
ctx.me = me;
/* Initialize memory with the initial pattern. */
sliced_foreach_segment(&ctx, me, block_move_init);
{ BAILR }
s_barrier();
/* Now move the data around */
sliced_foreach_segment(&ctx, me, block_move_move);
{ BAILR }
s_barrier();
/* And check it. */
sliced_foreach_segment(&ctx, me, block_move_check);
}
typedef struct {
ulong pat;
} bit_fade_ctx;
STATIC void bit_fade_fill_seg(ulong* restrict p,
ulong len_dw, const void* vctx) {
const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx;
ulong pat = ctx->pat;
for (ulong i = 0; i < len_dw; i++) {
p[i] = pat;
}
}
/*
* Test memory for bit fade, fill memory with pattern.
*/
void bit_fade_fill(ulong p1, int me)
{
/* Display the current pattern */
hprint(LINE_PAT, COL_PAT, p1);
/* Initialize memory with the initial pattern. */
bit_fade_ctx ctx;
ctx.pat = p1;
unsliced_foreach_segment(&ctx, me, bit_fade_fill_seg);
}
STATIC void bit_fade_chk_seg(ulong* restrict p,
ulong len_dw, const void* vctx) {
const bit_fade_ctx* restrict ctx = (const bit_fade_ctx*)vctx;
ulong pat = ctx->pat;
for (ulong i = 0; i < len_dw; i++) {
ulong bad;
if ((bad=p[i]) != pat) {
mt86_error(p+i, pat, bad);
}
}
}
void bit_fade_chk(ulong p1, int me)
{
bit_fade_ctx ctx;
ctx.pat = p1;
/* Make sure that nothing changed while sleeping */
unsliced_foreach_segment(&ctx, me, bit_fade_chk_seg);
}
/* Sleep for n seconds (or milliseconds, when 'sms' is set) */
void sleep(long n, int flag, int me,
int sms /* interpret 'n' as milliseconds instead */)
{
ulong sh, sl, l, h, t, ip=0;
/* save the starting time */
asm __volatile__(
"rdtsc":"=a" (sl),"=d" (sh));
/* loop for n seconds */
while (1) {
asm __volatile__(
"rep ; nop\n\t"
"rdtsc":"=a" (l),"=d" (h));
asm __volatile__ (
"subl %2,%0\n\t"
"sbbl %3,%1"
:"=a" (l), "=d" (h)
:"g" (sl), "g" (sh),
"0" (l), "1" (h));
if (sms != 0) {
t = h * ((unsigned)0xffffffff / vv->clks_msec);
t += (l / vv->clks_msec);
} else {
t = h * ((unsigned)0xffffffff / vv->clks_msec) / 1000;
t += (l / vv->clks_msec) / 1000;
}
/* Is the time up? */
if (t >= n) {
break;
}
/* Only display elapsed time if flag is set */
if (flag == 0) {
continue;
}
if (t != ip) {
do_tick(me);
{ BAILR }
ip = t;
}
}
}
void beep(unsigned int frequency)
{
#if 0
// BOZO(jcoiner)
// Removed this, we need to define outb_p() and inb_p()
// before reintroducing it.
#else
unsigned int count = 1193180 / frequency;
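    // e.g. frequency == 440 Hz gives a PIT divisor of 1193180/440 == 2711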
// Switch on the speaker
outb_p(inb_p(0x61)|3, 0x61);
// Set command for counter 2, 2 byte write
outb_p(0xB6, 0x43);
// Select desired Hz
outb_p(count & 0xff, 0x42);
outb((count >> 8) & 0xff, 0x42);
// Block for 100 microseconds
sleep(100, 0, 0, 1);
// Switch off the speaker
outb(inb_p(0x61)&0xFC, 0x61);
#endif
}