diff --git a/mak/COPY b/mak/COPY index 4c1719a041..6086012578 100644 --- a/mak/COPY +++ b/mak/COPY @@ -21,6 +21,8 @@ COPY=\ $(IMPDIR)\core\time.d \ $(IMPDIR)\core\vararg.d \ \ + $(IMPDIR)\core\experimental\memutils.d \ + \ $(IMPDIR)\core\internal\abort.d \ $(IMPDIR)\core\internal\arrayop.d \ $(IMPDIR)\core\internal\convert.d \ diff --git a/mak/DOCS b/mak/DOCS index fa49be8963..c5ea44bcc2 100644 --- a/mak/DOCS +++ b/mak/DOCS @@ -19,6 +19,8 @@ DOCS=\ $(DOCDIR)\core_gc_config.html \ $(DOCDIR)\core_gc_gcinterface.html \ $(DOCDIR)\core_gc_registry.html \ + \ + $(DOCDIR)\core_experimental_memutils.html \ \ $(DOCDIR)\core_stdc_assert_.html \ $(DOCDIR)\core_stdc_config.html \ diff --git a/mak/SRCS b/mak/SRCS index 309ca0f8d4..9d9d897cb0 100644 --- a/mak/SRCS +++ b/mak/SRCS @@ -16,6 +16,8 @@ SRCS=\ src\core\thread.d \ src\core\time.d \ src\core\vararg.d \ + \ + src\core\experimental\memutils.d \ \ src\core\gc\config.d \ src\core\gc\gcinterface.d \ diff --git a/mak/WINDOWS b/mak/WINDOWS index 8fc6f78e14..2d46889566 100644 --- a/mak/WINDOWS +++ b/mak/WINDOWS @@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d copy $** $@ +$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d + copy $** $@ + $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d new file mode 100644 index 0000000000..6b5735b9f1 --- /dev/null +++ b/src/core/experimental/memutils.d @@ -0,0 +1,258 @@ +/** + * Pure D replacement of the C Standard Library basic memory building blocks of string.h + * Source: $(DRUNTIMESRC core/experimental/memutils.d) + */ +module core.experimental.memutils; + +/** + * If T is an array, set all `dst`'s bytes + * (whose count is the length of the array times + * the size of the array element) to `val`. + * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. 
+ * N.B.: Contrary to the C Standard Library memset(), this function returns nothing. + * + * Params: + * val = The byte with which we want to fill memory. + * dst = Memory destination whose bytes are to be set to `val`. + */ +void memset(T)(ref T dst, const ubyte val) nothrow @nogc +{ + import core.internal.traits : isArray; + const uint v = cast(uint) val; + static if (isArray!T) + { + size_t n = dst.length * typeof(dst[0]).sizeof; + Dmemset(dst.ptr, v, n); + } + else + { + Dmemset(&dst, v, T.sizeof); + } +} + +version (D_SIMD) +{ + import core.simd : float4; + enum useSIMD = true; +} +else version (LDC) +{ + // LDC always supports SIMD (but doesn't ever set D_SIMD) and + // the back-end uses the most appropriate size for every target. + import core.simd : float4; + enum useSIMD = true; +} +else version (GNU) +{ + import core.simd : float4; + // GNU does not support SIMD by default. + version (X86_64) + { + private enum isX86 = true; + } + else version (X86) + { + private enum isX86 = true; + } + + static if (isX86 && __traits(compiles, int4)) + { + enum useSIMD = true; + } + else + { + enum useSIMD = false; + } +} + +version (useSIMD) +{ + /* SIMD implementation + */ + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc + { + import core.simd : int4; + version (LDC) + { + import ldc.simd : loadUnaligned, storeUnaligned; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + storeUnaligned!int4(reg, cast(int*) dest); + } + } + else version (DigitalMars) + { + import core.simd : void16, loadUnaligned, storeUnaligned; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + storeUnaligned(cast(void16*) dest, reg); + } + } + else + { + import gcc.builtins; + import core.simd : ubyte16; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg); + } + } + void store32i_sse(void *dest, int4 reg) nothrow @nogc + { + store16i_sse(dest, reg); + store16i_sse(dest+0x10, 
reg); + } + // NOTE(stefanos): I use the naive version, which in my benchmarks was slower + // than the previous classic switch. BUT. Using the switch had a significant + // drop in the rest of the sizes. It's not the branch that is responsible for the drop, + // but the fact that it's more difficult to optimize it as part of the rest of the code. + if (n < 32) + { + memsetNaive(d, val, n); + return; + } + void *temp = d + n - 0x10; // Used for the last 32 bytes + const uint v = val * 0x01010101; // Broadcast val to all 4 bytes + // Broadcast v to all bytes. + auto xmm0 = int4(v); + ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. + // Store 16 bytes, from which some will possibly overlap on a future store. + // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, + // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most + // 16, we store 16 bytes anyway. + store16i_sse(d, xmm0); + d += 16 - rem; + n -= 16 - rem; + // Move in blocks of 32. + if (n >= 32) + { + // Align to (previous) multiple of 32. That does something invisible to the code, + // but a good optimizer will avoid a `cmp` instruction inside the loop. With a + // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): + // sub RDX, 32; + // jge START_OF_THE_LOOP. + // Without that, it has to be: + // sub RDX, 32; + // cmp RDX, 32; + // jge START_OF_THE_LOOP + // NOTE that we align on a _previous_ multiple (for 37, we will go to 32). That means + // we have somehow to compensate for that, which is done at the end of this function. + n &= -32; + do + { + store32i_sse(d, xmm0); + // NOTE(stefanos): I tried avoiding this operation on `d` by combining + // `d` and `n` in the above loop and going backwards. It was slower in my benchmarks. + d += 32; + n -= 32; + } while (n >= 32); + } + // Compensate for the last (at most) 32 bytes. 
+ store32i_sse(temp-0x10, xmm0); + } +} +else +{ + /* Forward to simple implementation. + */ + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc + { + memsetNaive(d, val, n); + } +} + +/* Naive version for when there isn't any vector support (SIMD etc.). +*/ +private void memsetNaive(void *dst, const uint val, size_t n) nothrow @nogc +{ + // NOTE(stefanos): DMD could not inline it. + void handleLT16Sizes(void *d, const ulong v, size_t n) + { + switch (n) + { + case 6: + *(cast(uint*) (d+2)) = cast(uint) v; + goto case 2; // fall-through + case 2: + *(cast(ushort*) d) = cast(ushort) v; + return; + + case 7: + *(cast(uint*) (d+3)) = cast(uint) v; + goto case 3; // fall-through + case 3: + *(cast(ushort*) (d+1)) = cast(ushort) v; + goto case 1; // fall-through + case 1: + *(cast(ubyte*) d) = cast(ubyte) v; + return; + + case 4: + *(cast(uint*) d) = cast(uint) v; + return; + case 0: + return; + + case 5: + *(cast(uint*) (d+1)) = cast(uint) v; + *(cast(ubyte*) d) = cast(ubyte) v; + return; + default: + } + } + + + const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast c to all 8 bytes + if (n < 8) + { + handleLT16Sizes(dst, v, n); + return; + } + // NOTE(stefanos): Normally, we would have different alignment + // for 32-bit and 64-bit versions. For the sake of simplicity, + // we'll let the compiler do the work. + ubyte rem = cast(ubyte) dst & 7; + if (rem) + { // Unaligned + // Move 8 bytes (which we will possibly overlap later). + *(cast(ulong*) dst) = v; + dst += 8 - rem; + n -= 8 - rem; + } + ulong *d = cast(ulong*) dst; + ulong temp = n / 8; + // Go in steps of 8 - the register size in x86_64. + for (size_t i = 0; i != temp; ++i) + { + *d = v; + ++d; + n -= 8; + } + dst = cast(void *) d; + + handleLT16Sizes(dst, v, n); +} + + +/** Core features tests. 
+ */ +unittest +{ + ubyte[3] a; + memset(a, 7); + assert(a[0] == 7); + assert(a[1] == 7); + assert(a[2] == 7); + + real b; + memset(b, 9); + ubyte *p = cast(ubyte*) &b; + foreach (i; 0 .. b.sizeof) + { + assert(p[i] == 9); + } + + // Verify that it does not crash on empty array. + ubyte[0] c; + memset(c, 9); +} diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index bccf1ad356..aa331590ac 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -567,3 +567,117 @@ if (func.length == 1 /*&& isCallable!func*/) static assert(P_dglit.length == 1); static assert(is(P_dglit[0] == int)); } + +// [For internal use] +package template ModifyTypePreservingTQ(alias Modifier, T) +{ + static if (is(T U == immutable U)) alias ModifyTypePreservingTQ = immutable Modifier!U; + else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U; + else static if (is(T U == shared inout U)) alias ModifyTypePreservingTQ = shared inout Modifier!U; + else static if (is(T U == shared const U)) alias ModifyTypePreservingTQ = shared const Modifier!U; + else static if (is(T U == shared U)) alias ModifyTypePreservingTQ = shared Modifier!U; + else static if (is(T U == inout const U)) alias ModifyTypePreservingTQ = inout const Modifier!U; + else static if (is(T U == inout U)) alias ModifyTypePreservingTQ = inout Modifier!U; + else static if (is(T U == const U)) alias ModifyTypePreservingTQ = const Modifier!U; + else alias ModifyTypePreservingTQ = Modifier!T; +} + +@safe unittest +{ + alias Intify(T) = int; + static assert(is(ModifyTypePreservingTQ!(Intify, real) == int)); + static assert(is(ModifyTypePreservingTQ!(Intify, const real) == const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout real) == inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout const real) == inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared real) == shared int)); + static 
assert(is(ModifyTypePreservingTQ!(Intify, shared const real) == shared const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout real) == shared inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout const real) == shared inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, immutable real) == immutable int)); +} + +/** + * Strips off all `enum`s from type `T`. + */ +template OriginalType(T) +{ + template Impl(T) + { + static if (is(T U == enum)) alias Impl = OriginalType!U; + else alias Impl = T; + } + + alias OriginalType = ModifyTypePreservingTQ!(Impl, T); +} + +/// +@safe unittest +{ + enum E : real { a = 0 } // NOTE: explicit initialization to 0 required during Enum init deprecation cycle + enum F : E { a = E.a } + alias G = const(F); + static assert(is(OriginalType!E == real)); + static assert(is(OriginalType!F == real)); + static assert(is(OriginalType!G == const real)); +} + +/** + * Detect whether type `T` is an aggregate type. + */ +enum bool isAggregateType(T) = is(T == struct) || is(T == union) || + is(T == class) || is(T == interface); + +private template AliasThisTypeOf(T) +if (isAggregateType!T) +{ + alias members = __traits(getAliasThis, T); + + static if (members.length == 1) + { + alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0])); + } + else + static assert(0, T.stringof~" does not have alias this type"); +} + +/* + */ +template DynamicArrayTypeOf(T) +{ + static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT)) + alias X = DynamicArrayTypeOf!AT; + else + alias X = OriginalType!T; + + static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) + { + alias DynamicArrayTypeOf = X; + } + else + static assert(0, T.stringof~" is not a dynamic array"); +} + +// TODO(stefanos): More unit-testing. 
+ +@safe unittest +{ + static assert(!is(DynamicArrayTypeOf!(int[3]))); + static assert(!is(DynamicArrayTypeOf!(void[3]))); + static assert(!is(DynamicArrayTypeOf!(typeof(null)))); +} + +/** + * Detect whether type `T` is a dynamic array. + */ +enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; + +/** + * Detect whether type `T` is an array (static or dynamic; for associative + * arrays see $(LREF isAssociativeArray)). + */ +enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; + +/** + * Detect whether type `T` is a static array. + */ +enum bool isStaticArray(T) = __traits(isStaticArray, T); diff --git a/test/experimental/Makefile b/test/experimental/Makefile new file mode 100644 index 0000000000..2dbbd68aae --- /dev/null +++ b/test/experimental/Makefile @@ -0,0 +1,17 @@ +include ../common.mak + +TESTS:=memutils + +.PHONY: all clean +all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS))) + +$(ROOT)/%.done: $(ROOT)/% + @echo Testing $* + $(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS) + @touch $@ + +$(ROOT)/%: $(SRC)/%.d + $(QUIET)$(DMD) $(DFLAGS) -of$@ $< + +clean: + rm -rf $(ROOT) diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d new file mode 100644 index 0000000000..f9ed626c67 --- /dev/null +++ b/test/experimental/src/memutils.d @@ -0,0 +1,108 @@ +import core.experimental.memutils : memset; + +void main() +{ + DmemsetTestStaticType!(byte)(5); + DmemsetTestStaticType!(ubyte)(5); + DmemsetTestStaticType!(short)(5); + DmemsetTestStaticType!(ushort)(5); + DmemsetTestStaticType!(int)(5); + DmemsetTestStaticType!(uint)(5); + DmemsetTestStaticType!(long)(5); + DmemsetTestStaticType!(ulong)(5); + DmemsetTestStaticType!(float)(5); + DmemsetTestStaticType!(double)(5); + DmemsetTestStaticType!(real)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3); + static foreach (i; 1..10) { + DmemsetTestDynamicArray!(ubyte)(5, 2^^i); + DmemsetTestStaticArray!(ubyte, 2^^i)(5); + } + DmemsetTestDynamicArray!(ubyte)(5, 100); + 
DmemsetTestStaticArray!(ubyte, 100)(5); + DmemsetTestDynamicArray!(ubyte)(5, 500); + DmemsetTestStaticArray!(ubyte, 500)(5); + DmemsetTestDynamicArray!(ubyte)(5, 700); + DmemsetTestStaticArray!(ubyte, 700)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3434); + DmemsetTestStaticArray!(ubyte, 3434)(5); + DmemsetTestDynamicArray!(ubyte)(5, 7128); + DmemsetTestStaticArray!(ubyte, 7128)(5); + DmemsetTestDynamicArray!(ubyte)(5, 13908); + DmemsetTestStaticArray!(ubyte, 13908)(5); + DmemsetTestDynamicArray!(ubyte)(5, 16343); + DmemsetTestStaticArray!(ubyte, 16343)(5); + DmemsetTestDynamicArray!(ubyte)(5, 27897); + DmemsetTestStaticArray!(ubyte, 27897)(5); + DmemsetTestDynamicArray!(ubyte)(5, 32344); + DmemsetTestStaticArray!(ubyte, 32344)(5); + DmemsetTestDynamicArray!(ubyte)(5, 46830); + DmemsetTestStaticArray!(ubyte, 46830)(5); + DmemsetTestDynamicArray!(ubyte)(5, 64349); + DmemsetTestStaticArray!(ubyte, 64349)(5); +} + +void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) a.ptr; + foreach (i; 0 .. (a.length * T.sizeof)) + { + assert(p[i] == v); + } +} + +void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) &t; + foreach (i; 0 .. T.sizeof) + { + assert(p[i] == v); + } +} + +// NOTE(stefanos): Escaping the pointers is not needed, the compiler doesn't optimize it away. +// My best guess is that this is because of the verification (i.e. if the operation is not done, +// an assert will fire and does not satisfy correctness). + +void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) +{ + T[] buf; + buf.length = n + 32; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0 .. 
alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + memset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticArray(T, size_t n)(const ubyte v) +{ + T[n + 32] buf; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + memset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticType(T)(const ubyte v) +{ + T t; + escape(&t); + memset(t, v); + DmemsetVerifyStaticType(t, v); +}