diff options
Diffstat (limited to 'utils/thread_util.h')
-rw-r--r-- | utils/thread_util.h | 446 |
1 files changed, 446 insertions, 0 deletions
/* -*- mode: c; tab-width: 4; fill-column: 78 -*- */
/* vi: set ts=4 tw=78: */

/*
thread_util.h, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com>

Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
the above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation.  No representations are made about the suitability of this
software for any purpose.  It is provided "as is" without express or
implied warranty.
*/

#ifndef THREAD_UTIL_H
#define THREAD_UTIL_H

/* thread_util.h because C11 took threads.h. */

/* And POSIX threads because there aren't too many systems that support C11
   threads that don't already support POSIX threads.
   ...Not that it would be too hard to convert from the one to the other.
   Or to have both.
 */

/* Beware!
   Multithreading is a great way to add insidious and catastrophic bugs to
   a program. Make sure you understand the risks.

   You may wish to become familiar with race conditions, deadlocks, mutexes,
   condition variables, and, in lock-free code, memory ordering, cache
   hierarchies, etc., before working with threads.

   On the other hand, if a screenhack locks up or crashes, it's not the
   end of the world: XScreenSaver won't unlock the screen if that happens.
*/

/*
   The basic strategy for applying threads to a CPU-hungry screenhack:

   1. Find the CPU-hungry part of the hack.

   2. Change that part so the workload can be divided into N equal-sized
      loads, where N is the number of CPU cores in the machine.
      (For example: with two cores, one core could render even scan lines,
      and the other odd scan lines.)

   2a.
      Keeping in mind that two threads should not write to the same memory
      at the same time. Specifically, they should not be writing to the
      same cache line at the same time -- so align memory allocation and
      memory accesses to the system cache line size as necessary.

   3. On screenhack_init, create a threadpool object. This creates N worker
      threads, and each thread creates and owns a user-defined struct.
      After creation, the threads are idle.

   4. On screenhack_frame, call threadpool_run(). Each thread simultaneously
      wakes up, calls a function that does one of the equal-sized loads,
      then goes back to sleep. The main thread then calls threadpool_wait(),
      which returns once all the worker threads have finished.

   Using this to implement SMP won't necessarily increase performance by
   a factor of N (again, N is CPU cores.). Both X11 and Cocoa on OS X can
   impose a not-insignificant amount of overhead even when simply blitting
   full-screen XImages @ 30 FPS.

   On systems with simultaneous multithreading (a.k.a. Hyper-threading),
   performance gains may be slim to non-existent.
 */

#include "aligned_malloc.h"

#if HAVE_CONFIG_H
/* For HAVE_PTHREAD. */
# include "config.h"
#endif

#include <stddef.h> /* For size_t and offsetof (used by GET_PARENT_OBJ). */

#if HAVE_UNISTD_H
/* For _POSIX_THREADS. */
# include <unistd.h>
#endif

/* Display comes from either jwxyz (OS X / iOS) or real Xlib. */
#if defined HAVE_JWXYZ
# include "jwxyz.h"
#else
# include <X11/Xlib.h>
#endif

#if HAVE_PTHREAD
int threads_available(Display *dpy);
#else
# define threads_available(dpy) (-1)
#endif
/* > 0: Threads are available. This is normally _POSIX_VERSION.
   -1: Threads are not available.
*/

unsigned hardware_concurrency(Display *dpy);
/* This is supposed to return the number of available CPU cores. This number
   isn't necessarily constant: a system administrator can hotplug or
   enable/disable CPUs on certain systems, or the system can deactivate a
   malfunctioning core -- but these are rare.

   If threads are unavailable, this function will return 1.

   This function isn't fast; the result should be cached.
*/

unsigned thread_memory_alignment(Display *dpy);

/* Returns the proper alignment for memory allocated by a thread that is
   shared with other threads.

   A typical CPU accesses the system RAM through a cache, and this cache is
   divided up into cache lines - aligned chunks of memory typically 32 or 64
   bytes in size. Cache faults cause cache lines to be populated from
   memory. And, in a multiprocessing environment, two CPU cores can access
   the same cache line. The consequences of this depend on the CPU model:

   - x86 implements the MESI protocol [1] to maintain cache coherency between
     CPU cores, with a serious performance penalty on both Intel [1] and AMD
     [2]. Intel uses the term "false sharing" to describe two CPU cores
     accessing different memory in the same cache line.

   - ARM allows CPU caches to become inconsistent in this case [3]. Memory
     fences are needed to prevent horrible non-deterministic bugs from
     occurring. Other CPU architectures have similar behavior to one of the
     above, depending on whether they are "strongly-ordered" (like x86), or
     "weakly-ordered" (like ARM).

   Aligning multithreaded memory accesses according to the cache line size
   neatly sidesteps both issues.

   One complication is that CPU caches are divided up into separate levels,
   and occasionally different levels can have different cache line sizes, so
   to be safe this function returns the largest cache line size among all
   levels.

   If multithreading is not in effect, this returns sizeof(void *), because
   posix_memalign(3) will error out if the alignment is set to be smaller
   than that.

   [1] Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
       (Order Number: 248966-026): 2.1.5 Cache Hierarchy
   [2] Software Optimization Guide for AMD Family 10h Processors (Publication
       #40546): 11.3.4 Data Sharing between Caches
   [3] http://wanderingcoder.net/2011/04/01/arm-memory-ordering/
*/

/*
   Note: aligned_malloc uses posix_memalign(3) when available, or malloc(3)
   otherwise. As of SUSv2 (1997), and *probably* earlier, these are
   guaranteed to be thread-safe. C89 does not discuss threads, or thread
   safety; non-POSIX systems, watch out!
   http://pubs.opengroup.org/onlinepubs/7908799/xsh/threads.html
   http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_09.html
*/

/* int thread_malloc(void **ptr, Display *dpy, unsigned size); */
#define thread_malloc(ptr, dpy, size) \
	(aligned_malloc((ptr), thread_memory_alignment(dpy), (size)))

/*
   This simply does a malloc aligned to thread_memory_alignment(). See
   above. On failure, an errno is returned, usually ENOMEM.

   It's possible for two malloc()'d blocks to at least partially share the
   same cache line. When a different thread is writing to each block, then
   bad things can happen (see thread_memory_alignment). Better malloc()
   implementations will divide memory into pools belonging to one thread or
   another, causing memory blocks belonging to different threads to typically
   be located on different memory pages (see getpagesize(2)), mitigating the
   problem in question...but there's nothing stopping threads from passing
   memory to each other. And it's not practical for the system to align each
   block to 64 or 128 byte boundaries -- it's not uncommon to need lots and
   lots of 8-32 byte allocations, and the waste could become a bit excessive.

   Some rules of thumb to take away from this:

   1. Use thread_malloc for memory that might be written to by a thread that
      didn't originally allocate the object.

   2.
      Use thread_malloc for memory that will be handed from one thread to
      another.

   3. Use malloc if a single thread allocates, reads from, writes to, and
      frees the block of memory.

   Oddly, I (Dave) have not seen this problem described anywhere else.
*/

#define thread_free(ptr) aligned_free(ptr)

#if HAVE_PTHREAD
# if defined _POSIX_THREADS && _POSIX_THREADS >= 0
/*
   See The Open Group Base Specifications Issue 7, <unistd.h>, Constants for
   Options and Option Groups
   http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/unistd.h.html#tag_13_77_03_02
*/

# include <pthread.h>

/* Most PThread synchronization functions only fail when they are misused.
   PTHREAD_VERIFY(expr) expects expr to follow the pthreads convention of
   returning 0 on success: with NDEBUG the result is evaluated and
   discarded; otherwise a non-zero (failed) result trips the assert. */
# if defined NDEBUG
# define PTHREAD_VERIFY(expr) (void)(expr)
# else
# include <assert.h>
# define PTHREAD_VERIFY(expr) assert(!(expr))
# endif

/* Const templates for initializing pthread objects by struct assignment;
   defined in the corresponding .c file. */
extern const pthread_mutex_t mutex_initializer;
extern const pthread_cond_t cond_initializer;

# else
	/* Whatever caused HAVE_PTHREAD to be defined (configure script,
	   usually) made a mistake if this is reached. */
	/* Maybe this should be a warning. */
# error HAVE_PTHREAD is defined, but _POSIX_THREADS is not.
	/* #undef HAVE_PTHREAD */
# endif
#endif

struct threadpool
{
/* This is always the same as the count parameter fed to threadpool_create().
   Here's a neat trick: if the threadpool is zeroed out with a memset, and
   threadpool_create() is never called to create 0 threads, then
   threadpool::count can be used to determine if the threadpool object was
   ever initialized. */
	unsigned count;

	/* Copied from threadpool_class. No need for thread_create here,
	   though. */
	size_t thread_size;
	void (*thread_run)(void *self);
	void (*thread_destroy)(void *self);

	/* Storage for the thread objects when running without real threads
	   (the serial fallback). */
	void *serial_threads;

#if HAVE_PTHREAD
	pthread_mutex_t mutex;
	pthread_cond_t cond;

	/* Number of threads waiting for the startup signal. */
	unsigned parallel_pending;

	/* Number of threads still running. During startup, this is the index
	   of the thread currently being initialized. */
	unsigned parallel_unfinished;

	pthread_t *parallel_threads;
#endif
};

/*
   The threadpool_* functions manage a group of threads (naturally). Each
   thread owns an object described by a threadpool_class. When
   threadpool_run() is called, the specified func parameter is called on each
   thread in parallel. Sometime after calling threadpool_run(), call
   threadpool_wait(), which waits for each thread to return from
   threadpool_class::run().

   Note that thread 0 runs on the thread from which threadpool_run is called
   from, so if each thread has an equal workload, then when threadpool_run
   returns, the other threads will be finished or almost finished. Adding
   code between threadpool_run and threadpool_wait increases the odds that
   threadpool_wait won't actually have to wait at all -- which is nice.

   If the system does not provide threads, then these functions will fake
   it: everything will appear to work normally from the perspective of the
   caller, but when threadpool_run() is called, the "threads" are run
   synchronously; threadpool_wait() does nothing.
*/

struct threadpool_class
{
	/* Size of the thread private object. */
	size_t size;

/* Create the thread private object. Called in sequence for each thread
   (effectively) from threadpool_create.
     self: A pointer to size bytes of memory, allocated to hold the thread
           object.
     pool: The threadpool object that owns all the threads. If the threadpool
           is nested in another struct, try GET_PARENT_OBJ.
     id:   The ID for the thread; numbering starts at zero and goes up by one
           for each thread.
   Return 0 on success. On failure, return a value from errno.h; this will
   be returned from threadpool_create. */
	int (*create)(void *self, struct threadpool *pool, unsigned id);

/* Destroys the thread private object. Called in sequence (though not always
   the same sequence as create). Warning: During shutdown, it is possible
   for destroy() to be called while other threads are still in
   threadpool_run(). */
	void (*destroy)(void *self);
};

/* Returns 0 on success, on failure can return ENOMEM, or any error code
   from threadpool_class.create. */
int threadpool_create(struct threadpool *self, const struct threadpool_class *cls, Display *dpy, unsigned count);
void threadpool_destroy(struct threadpool *self);

void threadpool_run(struct threadpool *self, void (*func)(void *));
void threadpool_wait(struct threadpool *self);

/*
   io_thread is meant to wrap blocking I/O operations in a one-shot worker
   thread, with cancel semantics.

   Unlike threadpool_*, io_thread will not 'fake it'; it is up to the caller
   to figure out what to do if the system doesn't have threads. In
   particular, the start_routine passed to io_thread_create will never be
   called.

   Clients of io_thread implement four functions:
   - state *process_start(...);
     Starts the worker thread.
   - bool process_is_done(state *);
     Returns true if the I/O operation is complete.
   - void process_cancel(state *);
     "Cancels" the I/O operation. The thread will continue to run, but it
     will detach, and clean itself up upon completion.
   - int process_finish(state *, ...)
     Waits for the I/O operation to complete, returns results, and cleans
     up.

   Or: /---\
       \/  |  /--> cancel
   start -> is_done --+
                      \--> finish

   These functions follow a basic pattern:
   - start:
     1. Allocate a thread state object with thread_malloc. This state object
        contains an io_thread member.
     2. Save parameters from the start parameters to the state object.
     3. Start the thread with _io_thread_create. The thread receives the
        state object as its parameter.
   - On the worker thread:
     1. Do the I/O.
     2. Call io_thread_return.
     2a. If the result != 0, free the state object.
   - is_done:
     1. Just call _io_thread_is_done.
   - cancel:
     1. Call io_thread_cancel.
     1a.
         If the result != 0, free the state object.
   - finish:
     1. Call io_thread_finish.
     2. Copy results out of the state object as needed.
     3. Free the state object...or return it to the caller.

   Incidentally, there may sometimes be asynchronous versions of blocking
   I/O functions (struct aiocb and friends, for example); these should be
   preferred over io_thread when performance is a concern.
 */

/* NOTE(review): file-scope identifiers beginning with an underscore are
   reserved to the implementation (C11 7.1.3); consider renaming
   _io_thread_* in a future cleanup. Left as-is here because the names are
   part of the existing interface. */
enum _io_thread_status
{
	_io_thread_working, _io_thread_done, _io_thread_cancelled
};

struct io_thread
{
#if HAVE_PTHREAD
	/* Common misconception: "volatile" should be applied to atomic
	   variables, such as 'status', below. This is false, see
	   <http://stackoverflow.com/q/2484980>. */
	enum _io_thread_status status;
	pthread_t thread;
#else
	/* C89/C99 forbid empty structs; give the no-threads build a dummy
	   member. */
	char gcc_emits_a_warning_when_the_struct_has_no_members;
#endif
};

#if HAVE_PTHREAD

void *io_thread_create(struct io_thread *self, void *parent, void *(*start_routine)(void *), Display *dpy, unsigned stacksize);
/*
   Create the thread, returns NULL on failure. Failure is usually due to
   ENOMEM, or the system doesn't support threads.
     self:          The io_thread object to be initialized.
     parent:        The parameter to start_routine. The io_thread should be
                    contained within or be reachable from this.
     start_routine: The start routine for the worker thread.
     dpy:           The X11 Display, so that '*useThreads' is honored.
     stacksize:     The stack size for the thread. Set to 0 for the system
                    default.
   A note about stacksize: Linux, for example, uses a default of 2 MB of
   stack per thread. Now, this memory is usually committed on the first
   write, so lots of threads won't waste RAM, but it does mean that on a
   32-bit system, there's a limit of just under 1024 threads with the 2 MB
   default due to typical address space limitations of 2 GB for userspace
   processes. And 1024 threads might not always be enough...
 */

int io_thread_return(struct io_thread *self);
/* Called at the end of start_routine, from above.
Returns non-zero if the + thread has been cancelled, and cleanup needs to take place. */ + +int io_thread_is_done(struct io_thread *self); +/* Call from the main thread. Returns non-zero if the thread finished. */ + +int io_thread_cancel(struct io_thread *self); +/* Call from the main thread if the results from the worker thread are not + needed. This cleans up the io_thread. Returns non-zero if cleanup needs + to take place. */ + +void io_thread_finish(struct io_thread *self); +/* Call from the main thread to wait for the worker thread to finish. This + cleans up the io_thread. */ + +#else + +#define IO_THREAD_STACK_MIN 0 + +#define io_thread_create(self, parent, start_routine, dpy, stacksize) NULL +#define io_thread_return(self) 0 +#define io_thread_is_done(self) 1 +#define io_thread_cancel(self) 0 +#define io_thread_finish(self) + +#endif + +#if HAVE_PTHREAD +# define THREAD_DEFAULTS "*useThreads: True", +# define THREAD_DEFAULTS_XLOCK "*useThreads: True\n" +# define THREAD_OPTIONS \ + {"-threads", ".useThreads", XrmoptionNoArg, "True"}, \ + {"-no-threads", ".useThreads", XrmoptionNoArg, "False"}, +#else +# define THREAD_DEFAULTS +# define THREAD_DEFAULTS_XLOCK +# define THREAD_OPTIONS +#endif + +/* + If a variable 'member' is known to be a member (named 'member_name') of a + struct (named 'struct_name'), then this can find a pointer to the struct + that contains it. +*/ +#define GET_PARENT_OBJ(struct_name, member_name, member) (struct_name *)((char *)member - offsetof(struct_name, member_name)); + +#endif |