summaryrefslogtreecommitdiffstats
path: root/utils/aligned_malloc.c
blob: 8c45fc858208d007f3760296666071c57c125f3d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/* -*- mode: c; tab-width: 4; fill-column: 128 -*- */
/* vi: set ts=4 tw=128: */

/*
aligned_malloc.c, Copyright (c) 2014 Dave Odell <dmo2118@gmail.com>

Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
the above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation.  No representations are made about the suitability of this
software for any purpose.  It is provided "as is" without express or
implied warranty.
*/

#if HAVE_CONFIG_H
#	include "config.h"
#endif

#include "aligned_malloc.h"

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#include <assert.h>
#include <errno.h>

/* Alignment, in bytes, that aligned_malloc() uses when it is called with an
   alignment of 0. Starts out as sizeof(void *).
   Might be changed by thread_util.c:threads_available()
 */
unsigned int aligned_malloc_default_alignment = sizeof(void *);


#if HAVE_UNISTD_H
#	include <unistd.h>
#endif

#if defined __MACH__ && defined __APPLE__ /* OS X, iOS */
#	include <sys/sysctl.h>
#	include <inttypes.h>
#endif

/* True when x is positive and has exactly one bit set. x is evaluated more than once. */
#define IS_POWER_OF_2(x) ((x) > 0 && ((x) & ((x) - 1)) == 0)

/*
   arraysize(a): number of elements in the array a. Elsewhere this goes by
   countof(x), XtNumber(x), NELEMS(x), LEN(x), NUMOF(x), ARRAY_SIZE(x), etc.,
   because the fine folks behind C never got around to putting this incredibly
   useful macro where it belongs: in the standard library.

   Only meaningful for true arrays, since it divides sizeof(a) by the size of
   one element.
*/
#define arraysize(a) (sizeof(a) / sizeof((a)[0]))

/* arrayend(a): pointer to one past the last element of the array a. */
#define arrayend(a) (&(a)[arraysize(a)])

/*
   Much of the code here assumes that multiple processors in a system all use
   the same cache line size...which might be wrong on occasion.
*/

/*
These numbers are from:
- Linux: arch/(arch name)/include/asm/cache.h, note
  L1_CACHE_BYTES/L1_CACHE_SHIFT/SMP_CACHE_BYTES.
- FreeBSD: sys/(sys name)/include/param.h, note
  CACHE_LINE_SHIFT/CACHE_LINE_SIZE.

Preprocessor symbols come from:
- TARGET_CPU_CPP_BUILTINS() in the GNU C preprocessor
  <http://code.ohloh.net/?s=%22TARGET_CPU_CPP_BUILTINS%22&fp=304413>
- http://predef.sourceforge.net/
*/

/*
Several architectures need preprocessor symbols.

Qualcomm Hexagon: 1 << 5
Imagination Technologies META: 1 << 6
OpenRISC: 16 (Linux has the cache line size as a todo.)
Unicore: 1 << 5
*/

/*
Apple platforms: use 64 bytes as the compile-time cache line size whenever
TARGET_OS_IPHONE is defined.

NOTE(review): TargetConditionals.h appears to define TARGET_OS_IPHONE (as 0 or
1) on every Apple platform, in which case this #ifdef also holds on OS X --
confirm whether "#if TARGET_OS_IPHONE" was intended.
*/
#if defined __MACH__ && defined __APPLE__ /* OS X, iOS */
#	include <TargetConditionals.h> /* For TARGET_OS_IPHONE. */
#	ifdef TARGET_OS_IPHONE
#		define _CACHE_LINE_SIZE 64
#	endif
#endif

/*
FreeBSD: if nothing has set _CACHE_LINE_SIZE yet, take CACHE_LINE_SIZE from
<machine/param.h> when that header provides it.
*/
#if defined __FreeBSD__ && !defined _CACHE_LINE_SIZE
#	include <machine/param.h>
#	ifdef CACHE_LINE_SIZE
#		define _CACHE_LINE_SIZE CACHE_LINE_SIZE
#	endif
#endif

/*
Per-architecture compile-time guesses, used only when no OS-specific rule above
has already defined _CACHE_LINE_SIZE. The first matching #elif wins; an
architecture that matches none of them leaves _CACHE_LINE_SIZE undefined here.
Values are in bytes.
*/
#if !defined _CACHE_LINE_SIZE
#	if defined __alpha || defined __alpha__
/* DEC Alpha */
#		define _CACHE_LINE_SIZE 64 /* EV6 and above. EV4 and EV5 use 32 bytes. */
#	elif defined __arm__
/* ARM architecture */
#		define _CACHE_LINE_SIZE (1 << 6)
#	elif defined __AVR || defined __AVR__
/* Atmel AVR32 */
#		define _CACHE_LINE_SIZE (1 << 5)
#	elif defined __bfin || defined __BFIN__
/* Analog Devices Blackfin */
#		define _CACHE_LINE_SIZE (1 << 5)
#	elif defined _TMS320C6X || defined __TMS320C6X__
/* Texas Instruments TMS320C6x */
#		define _CACHE_LINE_SIZE (1 << 7) /* From L2. L1 data cache line is 1 << 6. */
#	elif defined __cris
/* Axis Communications ETRAX CRIS */
#		define _CACHE_LINE_SIZE 32
#	elif defined __ia64__ || defined _IA64
/* Intel Itanium */
#		define _CACHE_LINE_SIZE (1 << 7)
#	elif defined __M32R__ || defined __m32r__
/* Mitsubishi/Renesas M32R */
#		define _CACHE_LINE_SIZE (1 << 4)
#	elif defined __m68k__ || defined M68000 || defined __MC68K__
/* Motorola 68000 */
#		define _CACHE_LINE_SIZE (1 << 4)
#	elif defined __MICROBLAZE__ || defined __microblaze__
/* Xilinx MicroBlaze */
#		define _CACHE_LINE_SIZE (1 << 5)
#	elif defined __mips__ || defined __mips || defined __MIPS__
/* MIPS */
#		define _CACHE_LINE_SIZE (1 << 6)
#	elif defined __mn10300__ || defined __MN10300__
/* Matsushita/Panasonic MN103 */
#		define _CACHE_LINE_SIZE 32 /* MN103E010 has 16 bytes. */
#	elif defined __hppa || defined __hppa__
/* Hewlett-Packard PA-RISC */
#		define _CACHE_LINE_SIZE 64 /* PA-RISC 2.0 uses 64 bytes, PA-RISC 1.1 uses 32. */
#	elif defined __powerpc || defined _ARCH_PPC
/* Power Architecture (a.k.a. PowerPC) */
#		define _CACHE_LINE_SIZE (1 << 7) /* Linux has a list of PPC models with associated L1_CACHE_SHIFT values. */
#	elif defined __s390__ || defined __370__ || defined __zarch__ || defined __SYSC_ZARCH__
/* IBM System/390 */
#		define _CACHE_LINE_SIZE 256
#	elif defined SUNPLUS || defined __SCORE__ || defined __score__
/* Sunplus S+core */
#		define _CACHE_LINE_SIZE (1 << 4)
#	elif defined __sh__
/* Hitachi SuperH */
#		define _CACHE_LINE_SIZE (1 << 5) /* SH3 and earlier used 1 << 4. */
#	elif defined __sparc__ || defined __sparc
/* SPARC */
#		define _CACHE_LINE_SIZE (1 << 7) /* Linux and FreeBSD disagree as to what this should be. */
#	elif defined __tile__
/* Tilera TILE series */
#		define _CACHE_LINE_SIZE (1 << 6) /* TILEPro uses different sizes for L1 and L2. */
#	elif defined __i386 || defined __x86_64
/* x86(-64) */
#		define _CACHE_LINE_SIZE (1 << 7)
#	elif defined __xtensa__ || defined __XTENSA__
/* Cadence Design Systems/Tensilica Xtensa */
#		define _CACHE_LINE_SIZE (1 << 5) /* 1 << 4 on some models. */
#	endif
#endif /* !defined _CACHE_LINE_SIZE */

/* NetBSD: only consulted when nothing earlier has defined _CACHE_LINE_SIZE. */
#if defined __NetBSD__ && !defined _CACHE_LINE_SIZE
/*
NetBSD defines COHERENCY_UNIT to be 32 on MIPS, and 64 for all other platforms -- which is wrong. Still, this is what the kernel
uses; if this value didn't work, the system wouldn't run.
*/
#	include <sys/param.h>
#		ifdef COHERENCY_UNIT
#		define _CACHE_LINE_SIZE COHERENCY_UNIT
#	endif
#endif

/* Last resort: no platform or architecture rule matched, so pick a deliberately large value. */
#ifndef _CACHE_LINE_SIZE
#	define _CACHE_LINE_SIZE 256 /* Fallback cache line size. */
#endif

static unsigned _get_cache_line_size(void)
{
	/*
	Lookup order:
	1. Ask the operating system for the actual cache line size. To keep
	   things simple, only glibc (sysconf) and OS X (sysctl) are consulted.
	   Other sources that could be added:
	   - The CPUID instruction, for x86 CPUs.
	   - The ELF auxiliary vector (Power, Alpha, SuperH).
	   - /sys/devices/system/cpu/cpu?/cache/index?/coherency_line_size on
	     Linux (x86 only, AFAIK).
	   - cache_alignment in /proc/cpuinfo on Linux.
	   - PICL on Solaris.
	2. If that fails, return a value appropriate for the current CPU
	   architecture.
	3. Otherwise, return a sufficiently large number.
	Steps 2 and 3 are both carried by the _CACHE_LINE_SIZE macro.
	*/

	/*
	sysconf(3) is a glibc call rather than a syscall. For cache line sizes it
	uses CPUID on x86 and returns 0 on other platforms. Supporting most other
	platforms would mean fetching cache information from the kernel, because
	the processor usually exposes it in privileged mode only.
	https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/i386/sysconf.c;hb=HEAD

	uClibc, newlib, dietlibc, musl, Bionic do not have these names.
	*/

#	if HAVE_UNISTD_H && ( \
	defined _SC_LEVEL1_DCACHE_LINESIZE || \
	defined _SC_LEVEL2_CACHE_LINESIZE || \
	defined _SC_LEVEL3_CACHE_LINESIZE || \
	defined _SC_LEVEL4_CACHE_LINESIZE)
	{
		/* Every cache level this libc knows how to report. */
		static const int sysconf_names[] =
		{
#		ifdef _SC_LEVEL1_DCACHE_LINESIZE
			_SC_LEVEL1_DCACHE_LINESIZE,
#		endif
#		ifdef _SC_LEVEL2_CACHE_LINESIZE
			_SC_LEVEL2_CACHE_LINESIZE,
#		endif
#		ifdef _SC_LEVEL3_CACHE_LINESIZE
			_SC_LEVEL3_CACHE_LINESIZE,
#		endif
#		ifdef  _SC_LEVEL4_CACHE_LINESIZE
			_SC_LEVEL4_CACHE_LINESIZE
#		endif
		};

		long largest = 0;
		size_t i;

		/* Keep the largest line size reported across all levels. */
		for(i = 0; i != arraysize(sysconf_names); ++i)
		{
			/* sysconf() can return -1 or 0 on failure; neither beats 0. */
			const long line_size = sysconf(sysconf_names[i]);

			if(largest < line_size)
				largest = line_size;
		}

		if(largest != 0)
			return largest;

		/* Currently, this fails for every platform that isn't x86. Perhaps
		   future versions will support other processors? */
	}
#	endif

#	if defined __MACH__ && defined __APPLE__
	{
		static const int mib[] = {CTL_HW, HW_CACHELINE};
		uint32_t line_size; /* sysctl.h says that hw.cachelinesize is a
		                       CTLTYPE_INT. */
		size_t length = sizeof(line_size);

		/* The (int *) cast is for OS X. */
		if(sysctl((int *)mib, 2, &line_size, &length, NULL, 0) == 0)
		{
			assert(length == sizeof(line_size));
			return line_size;
		}
	}
#	endif

	/* Nothing answered at run time: guess based on the CPU type. */
	return _CACHE_LINE_SIZE;
}

/*
Returns the cache line size in bytes, as found by _get_cache_line_size().
Asserts that the value is at least sizeof(void *) and a power of two.
*/
unsigned get_cache_line_size(void)
{
	const unsigned line_size = _get_cache_line_size();

	assert(line_size >= sizeof(void *));
	assert(IS_POWER_OF_2(line_size));

	return line_size;
}

/* aligned_alloc() (C11) or posix_memalign() (POSIX) are other possibilities
   for aligned_malloc().
 */

/*
Allocates size bytes at an address that is a multiple of alignment.

ptr:       receives the aligned pointer on success; not written on failure.
alignment: required alignment in bytes. Must be a power of two; 0 selects
           aligned_malloc_default_alignment.
size:      number of usable bytes requested.

Returns 0 on success, or ENOMEM if malloc() fails or if size is so large that
adding the bookkeeping overhead would overflow size_t.

The pointer returned by malloc() is stored in the word immediately below *ptr,
so the result must be released with aligned_free(), not free().
*/
int aligned_malloc(void **ptr, unsigned alignment, size_t size)
{
	void *block_start;
	uintptr_t align_mask;
	uintptr_t aligned;
	size_t overhead;

	if(alignment == 0)
		alignment = aligned_malloc_default_alignment;

	assert(alignment && !(alignment & (alignment - 1))); /* alignment must be a power of two. */

	align_mask = (uintptr_t)alignment - 1;

	/* Space for the saved malloc() pointer plus worst-case alignment padding. */
	overhead = sizeof(void *) + (size_t)align_mask;

	/* Without this check size + overhead could wrap around, malloc() would
	   succeed with a tiny block, and the caller would overrun it. */
	if(size > (size_t)-1 - overhead)
		return ENOMEM;

	block_start = malloc(size + overhead);
	if(!block_start)
		return ENOMEM;

	/* Skip past the slot for the saved pointer, then round up to the alignment. */
	aligned = ((uintptr_t)block_start + sizeof(void *) + align_mask) & ~align_mask;

	*ptr = (void *)aligned;
	((void **)(*ptr))[-1] = block_start; /* Read back by aligned_free(). */
	return 0;
}

/*
Releases a pointer obtained from aligned_malloc(). A NULL ptr is ignored.

aligned_malloc() stores the pointer returned by malloc() in the word just below
the aligned address; that stored pointer is what gets passed to free().
*/
void aligned_free(void *ptr)
{
	void **saved_block;

	if(!ptr)
		return;

	saved_block = (void **)ptr - 1;
	free(*saved_block);
}