diff --git a/mak/COPY b/mak/COPY index 4c1719a041..6086012578 100644 --- a/mak/COPY +++ b/mak/COPY @@ -21,6 +21,8 @@ COPY=\ $(IMPDIR)\core\time.d \ $(IMPDIR)\core\vararg.d \ \ + $(IMPDIR)\core\experimental\memutils.d \ + \ $(IMPDIR)\core\internal\abort.d \ $(IMPDIR)\core\internal\arrayop.d \ $(IMPDIR)\core\internal\convert.d \ diff --git a/mak/DOCS b/mak/DOCS index fa49be8963..c5ea44bcc2 100644 --- a/mak/DOCS +++ b/mak/DOCS @@ -19,6 +19,8 @@ DOCS=\ $(DOCDIR)\core_gc_config.html \ $(DOCDIR)\core_gc_gcinterface.html \ $(DOCDIR)\core_gc_registry.html \ + \ + $(DOCDIR)\core_experimental_memutils.html \ \ $(DOCDIR)\core_stdc_assert_.html \ $(DOCDIR)\core_stdc_config.html \ diff --git a/mak/SRCS b/mak/SRCS index 309ca0f8d4..9d9d897cb0 100644 --- a/mak/SRCS +++ b/mak/SRCS @@ -16,6 +16,8 @@ SRCS=\ src\core\thread.d \ src\core\time.d \ src\core\vararg.d \ + \ + src\core\experimental\memutils.d \ \ src\core\gc\config.d \ src\core\gc\gcinterface.d \ diff --git a/mak/WINDOWS b/mak/WINDOWS index 8fc6f78e14..2d46889566 100644 --- a/mak/WINDOWS +++ b/mak/WINDOWS @@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d copy $** $@ +$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d + copy $** $@ + $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d new file mode 100644 index 0000000000..6b5735b9f1 --- /dev/null +++ b/src/core/experimental/memutils.d @@ -0,0 +1,258 @@ +/** + * Pure D replacement of the C Standard Library basic memory building blocks of string.h + * Source: $(DRUNTIMESRC core/experimental/memutils.d) + */ +module core.experimental.memutils; + +/** + * If T is an array, set all `dst`'s bytes + * (whose count is the length of the array times + * the size of the array element) to `val`. + * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. 
+ * N.B.: Contrary to the C Standard Library memset(), this function returns nothing. + * + * Params: + * val = The byte with which we want to fill memory. + * dst = Memory destination whose bytes are to be set to `val`. + */ +void memset(T)(ref T dst, const ubyte val) nothrow @nogc +{ + import core.internal.traits : isArray; + const uint v = cast(uint) val; + static if (isArray!T) + { + size_t n = dst.length * typeof(dst[0]).sizeof; + Dmemset(dst.ptr, v, n); + } + else + { + Dmemset(&dst, v, T.sizeof); + } +} + +version (D_SIMD) +{ + import core.simd : float4; + enum useSIMD = true; +} +else version (LDC) +{ + // LDC always supports SIMD (but doesn't ever set D_SIMD) and + // the back-end uses the most appropriate size for every target. + import core.simd : float4; + enum useSIMD = true; +} +else version (GNU) +{ + import core.simd : float4; + // GNU does not support SIMD by default. + version (X86_64) + { + private enum isX86 = true; + } + else version (X86) + { + private enum isX86 = true; + } + + static if (isX86 && __traits(compiles, int4)) + { + enum useSIMD = true; + } + else + { + enum useSIMD = false; + } +} + +version (useSIMD) +{ + /* SIMD implementation + */ + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc + { + import core.simd : int4; + version (LDC) + { + import ldc.simd : loadUnaligned, storeUnaligned; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + storeUnaligned!int4(reg, cast(int*) dest); + } + } + else version (DigitalMars) + { + import core.simd : void16, loadUnaligned, storeUnaligned; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + storeUnaligned(cast(void16*) dest, reg); + } + } + else + { + import gcc.builtins; + import core.simd : ubyte16; + void store16i_sse(void *dest, int4 reg) nothrow @nogc + { + __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg); + } + } + void store32i_sse(void *dest, int4 reg) nothrow @nogc + { + store16i_sse(dest, reg); + store16i_sse(dest+0x10, 
reg); + } + // NOTE(stefanos): I use the naive version, which in my benchmarks was slower + // than the previous classic switch. BUT. Using the switch had a significant + // drop in the rest of the sizes. It's not the branch that is responsible for the drop, + // but the fact that it's more difficult to optimize it as part of the rest of the code. + if (n < 32) + { + memsetNaive(d, val, n); + return; + } + void *temp = d + n - 0x10; // Used for the last 32 bytes + const uint v = val * 0x01010101; // Broadcast val to all 4 bytes + // Broadcast v to all bytes. + auto xmm0 = int4(v); + ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. + // Store 16 bytes, from which some will possibly overlap on a future store. + // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, + // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most + // 16, we store 16 bytes anyway. + store16i_sse(d, xmm0); + d += 16 - rem; + n -= 16 - rem; + // Move in blocks of 32. + if (n >= 32) + { + // Align to (previous) multiple of 32. That does something invisible to the code, + // but a good optimizer will avoid a `cmp` instruction inside the loop. With a + // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): + // sub RDX, 32; + // jge START_OF_THE_LOOP. + // Without that, it has to be: + // sub RDX, 32; + // cmp RDX, 32; + // jge START_OF_THE_LOOP + // NOTE that we align on a _previous_ multiple (for 37, we will go to 32). That means + // we have somehow to compensate for that, which is done at the end of this function. + n &= -32; + do + { + store32i_sse(d, xmm0); + // NOTE(stefanos): I tried avoiding this operation on `d` by combining + // `d` and `n` in the above loop and going backwards. It was slower in my benchmarks. + d += 32; + n -= 32; + } while (n >= 32); + } + // Compensate for the last (at most) 32 bytes. 
+ store32i_sse(temp-0x10, xmm0); + } +} +else +{ + /* Forward to simple implementation. + */ + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc + { + memsetNaive(d, val, n); + } +} + +/* Naive version for when there isn't any vector support (SIMD etc.). +*/ +private void memsetNaive(void *dst, const uint val, size_t n) nothrow @nogc +{ + // NOTE(stefanos): DMD could not inline it. + void handleLT16Sizes(void *d, const ulong v, size_t n) + { + switch (n) + { + case 6: + *(cast(uint*) (d+2)) = cast(uint) v; + goto case 2; // fall-through + case 2: + *(cast(ushort*) d) = cast(ushort) v; + return; + + case 7: + *(cast(uint*) (d+3)) = cast(uint) v; + goto case 3; // fall-through + case 3: + *(cast(ushort*) (d+1)) = cast(ushort) v; + goto case 1; // fall-through + case 1: + *(cast(ubyte*) d) = cast(ubyte) v; + return; + + case 4: + *(cast(uint*) d) = cast(uint) v; + return; + case 0: + return; + + case 5: + *(cast(uint*) (d+1)) = cast(uint) v; + *(cast(ubyte*) d) = cast(ubyte) v; + return; + default: + } + } + + + const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast c to all 8 bytes + if (n < 8) + { + handleLT16Sizes(dst, v, n); + return; + } + // NOTE(stefanos): Normally, we would have different alignment + // for 32-bit and 64-bit versions. For the sake of simplicity, + // we'll let the compiler do the work. + ubyte rem = cast(ubyte) dst & 7; + if (rem) + { // Unaligned + // Move 8 bytes (which we will possibly overlap later). + *(cast(ulong*) dst) = v; + dst += 8 - rem; + n -= 8 - rem; + } + ulong *d = cast(ulong*) dst; + ulong temp = n / 8; + // Go in steps of 8 - the register size in x86_64. + for (size_t i = 0; i != temp; ++i) + { + *d = v; + ++d; + n -= 8; + } + dst = cast(void *) d; + + handleLT16Sizes(dst, v, n); +} + + +/** Core features tests. 
+ */ +unittest +{ + ubyte[3] a; + memset(a, 7); + assert(a[0] == 7); + assert(a[1] == 7); + assert(a[2] == 7); + + real b; + memset(b, 9); + ubyte *p = cast(ubyte*) &b; + foreach (i; 0 .. b.sizeof) + { + assert(p[i] == 9); + } + + // Verify that it does not crash on empty array. + ubyte[0] c; + memset(c, 9); +} diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index bccf1ad356..aa331590ac 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -567,3 +567,117 @@ if (func.length == 1 /*&& isCallable!func*/) static assert(P_dglit.length == 1); static assert(is(P_dglit[0] == int)); } + +// [For internal use] +package template ModifyTypePreservingTQ(alias Modifier, T) +{ + static if (is(T U == immutable U)) alias ModifyTypePreservingTQ = immutable Modifier!U; + else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U; + else static if (is(T U == shared inout U)) alias ModifyTypePreservingTQ = shared inout Modifier!U; + else static if (is(T U == shared const U)) alias ModifyTypePreservingTQ = shared const Modifier!U; + else static if (is(T U == shared U)) alias ModifyTypePreservingTQ = shared Modifier!U; + else static if (is(T U == inout const U)) alias ModifyTypePreservingTQ = inout const Modifier!U; + else static if (is(T U == inout U)) alias ModifyTypePreservingTQ = inout Modifier!U; + else static if (is(T U == const U)) alias ModifyTypePreservingTQ = const Modifier!U; + else alias ModifyTypePreservingTQ = Modifier!T; +} + +@safe unittest +{ + alias Intify(T) = int; + static assert(is(ModifyTypePreservingTQ!(Intify, real) == int)); + static assert(is(ModifyTypePreservingTQ!(Intify, const real) == const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout real) == inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout const real) == inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared real) == shared int)); + static 
assert(is(ModifyTypePreservingTQ!(Intify, shared const real) == shared const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout real) == shared inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout const real) == shared inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, immutable real) == immutable int)); +} + +/** + * Strips off all `enum`s from type `T`. + */ +template OriginalType(T) +{ + template Impl(T) + { + static if (is(T U == enum)) alias Impl = OriginalType!U; + else alias Impl = T; + } + + alias OriginalType = ModifyTypePreservingTQ!(Impl, T); +} + +/// +@safe unittest +{ + enum E : real { a = 0 } // NOTE: explicit initialization to 0 required during Enum init deprecation cycle + enum F : E { a = E.a } + alias G = const(F); + static assert(is(OriginalType!E == real)); + static assert(is(OriginalType!F == real)); + static assert(is(OriginalType!G == const real)); +} + +/** + * Detect whether type `T` is an aggregate type. + */ +enum bool isAggregateType(T) = is(T == struct) || is(T == union) || + is(T == class) || is(T == interface); + +private template AliasThisTypeOf(T) +if (isAggregateType!T) +{ + alias members = __traits(getAliasThis, T); + + static if (members.length == 1) + { + alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0])); + } + else + static assert(0, T.stringof~" does not have alias this type"); +} + +/* + */ +template DynamicArrayTypeOf(T) +{ + static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT)) + alias X = DynamicArrayTypeOf!AT; + else + alias X = OriginalType!T; + + static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) + { + alias DynamicArrayTypeOf = X; + } + else + static assert(0, T.stringof~" is not a dynamic array"); +} + +// TODO(stefanos): More unit-testing. 
+ +@safe unittest +{ + static assert(!is(DynamicArrayTypeOf!(int[3]))); + static assert(!is(DynamicArrayTypeOf!(void[3]))); + static assert(!is(DynamicArrayTypeOf!(typeof(null)))); +} + +/** + * Detect whether type `T` is a dynamic array. + */ +enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; + +/** + * Detect whether type `T` is an array (static or dynamic; for associative + * arrays see $(LREF isAssociativeArray)). + */ +enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; + +/** + * Detect whether type `T` is a static array. + */ +enum bool isStaticArray(T) = __traits(isStaticArray, T); diff --git a/test/experimental/Makefile b/test/experimental/Makefile new file mode 100644 index 0000000000..2dbbd68aae --- /dev/null +++ b/test/experimental/Makefile @@ -0,0 +1,17 @@ +include ../common.mak + +TESTS:=memutils + +.PHONY: all clean +all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS))) + +$(ROOT)/%.done: $(ROOT)/% + @echo Testing $* + $(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS) + @touch $@ + +$(ROOT)/%: $(SRC)/%.d + $(QUIET)$(DMD) $(DFLAGS) -of$@ $< + +clean: + rm -rf $(ROOT) diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d new file mode 100644 index 0000000000..f9ed626c67 --- /dev/null +++ b/test/experimental/src/memutils.d @@ -0,0 +1,108 @@ +import core.experimental.memutils : memset; + +void main() +{ + DmemsetTestStaticType!(byte)(5); + DmemsetTestStaticType!(ubyte)(5); + DmemsetTestStaticType!(short)(5); + DmemsetTestStaticType!(ushort)(5); + DmemsetTestStaticType!(int)(5); + DmemsetTestStaticType!(uint)(5); + DmemsetTestStaticType!(long)(5); + DmemsetTestStaticType!(ulong)(5); + DmemsetTestStaticType!(float)(5); + DmemsetTestStaticType!(double)(5); + DmemsetTestStaticType!(real)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3); + static foreach (i; 1..10) { + DmemsetTestDynamicArray!(ubyte)(5, 2^^i); + DmemsetTestStaticArray!(ubyte, 2^^i)(5); + } + DmemsetTestDynamicArray!(ubyte)(5, 100); + 
DmemsetTestStaticArray!(ubyte, 100)(5); + DmemsetTestDynamicArray!(ubyte)(5, 500); + DmemsetTestStaticArray!(ubyte, 500)(5); + DmemsetTestDynamicArray!(ubyte)(5, 700); + DmemsetTestStaticArray!(ubyte, 700)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3434); + DmemsetTestStaticArray!(ubyte, 3434)(5); + DmemsetTestDynamicArray!(ubyte)(5, 7128); + DmemsetTestStaticArray!(ubyte, 7128)(5); + DmemsetTestDynamicArray!(ubyte)(5, 13908); + DmemsetTestStaticArray!(ubyte, 13908)(5); + DmemsetTestDynamicArray!(ubyte)(5, 16343); + DmemsetTestStaticArray!(ubyte, 16343)(5); + DmemsetTestDynamicArray!(ubyte)(5, 27897); + DmemsetTestStaticArray!(ubyte, 27897)(5); + DmemsetTestDynamicArray!(ubyte)(5, 32344); + DmemsetTestStaticArray!(ubyte, 32344)(5); + DmemsetTestDynamicArray!(ubyte)(5, 46830); + DmemsetTestStaticArray!(ubyte, 46830)(5); + DmemsetTestDynamicArray!(ubyte)(5, 64349); + DmemsetTestStaticArray!(ubyte, 64349)(5); +} + +void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) a.ptr; + foreach (i; 0 .. (a.length * T.sizeof)) + { + assert(p[i] == v); + } +} + +void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) &t; + foreach (i; 0 .. T.sizeof) + { + assert(p[i] == v); + } +} + +// NOTE(stefanos): Escaping the pointers is not needed, the compiler doesn't optimize it away. +// My best guess is that this is because of the verification (i.e. if the operation is not done, +// an assert will fire and does not satisfy correctness). + +void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) +{ + T[] buf; + buf.length = n + 32; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0 .. 
alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + memset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticArray(T, size_t n)(const ubyte v) +{ + T[n + 32] buf; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + memset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticType(T)(const ubyte v) +{ + T t; + escape(&t); + memset(t, v); + DmemsetVerifyStaticType(t, v); +}