# upstream trunk r1907541, r1907637, r1907642, r1907677, r1907678
--- apr.orig/atomic/unix/mutex64.c
+++ apr/atomic/unix/mutex64.c
@@ -96,7 +96,14 @@ apr_status_t apr__atomic_generic64_init(
 
 APR_DECLARE(apr_uint64_t) apr_atomic_read64(volatile apr_uint64_t *mem)
 {
-    return *mem;
+    apr_uint64_t cur_value;
+    DECLARE_MUTEX_LOCKED(mutex, mem);
+
+    cur_value = *mem;
+
+    MUTEX_UNLOCK(mutex);
+
+    return cur_value;
 }
 
 APR_DECLARE(void) apr_atomic_set64(volatile apr_uint64_t *mem, apr_uint64_t val)
--- apr.orig/atomic/unix/builtins.c
+++ apr/atomic/unix/builtins.c
@@ -18,10 +18,11 @@
 
 #ifdef USE_ATOMICS_BUILTINS
 
-#if defined(__arm__) || defined(__powerpc__) || defined(__powerpc64__)
-#define WEAK_MEMORY_ORDERING 1
-#else
+#if defined(__i386__) || defined(__x86_64__) \
+    || defined(__s390__) || defined(__s390x__)
 #define WEAK_MEMORY_ORDERING 0
+#else
+#define WEAK_MEMORY_ORDERING 1
 #endif
 
 APR_DECLARE(apr_status_t) apr_atomic_init(apr_pool_t *p)
--- apr.orig/atomic/unix/builtins64.c
+++ apr/atomic/unix/builtins64.c
@@ -18,17 +18,18 @@
 
 #ifdef USE_ATOMICS_BUILTINS64
 
-#if defined(__arm__) || defined(__powerpc__) || defined(__powerpc64__)
-#define WEAK_MEMORY_ORDERING 1
-#else
+#if defined(__i386__) || defined(__x86_64__) \
+    || defined(__s390__) || defined(__s390x__)
 #define WEAK_MEMORY_ORDERING 0
+#else
+#define WEAK_MEMORY_ORDERING 1
 #endif
 
 APR_DECLARE(apr_uint64_t) apr_atomic_read64(volatile apr_uint64_t *mem)
 {
 #if HAVE__ATOMIC_BUILTINS64
     return __atomic_load_n(mem, __ATOMIC_SEQ_CST);
-#elif WEAK_MEMORY_ORDERING
+#elif WEAK_MEMORY_ORDERING || APR_SIZEOF_VOIDP < 8
     /* No __sync_load() available => apr_atomic_add64(mem, 0) */
     return __sync_fetch_and_add(mem, 0);
 #else
@@ -40,7 +41,7 @@ APR_DECLARE(void) apr_atomic_set64(volat
 {
 #if HAVE__ATOMIC_BUILTINS64
     __atomic_store_n(mem, val, __ATOMIC_SEQ_CST);
-#elif WEAK_MEMORY_ORDERING
+#elif WEAK_MEMORY_ORDERING || APR_SIZEOF_VOIDP < 8
     /* No __sync_store() available => apr_atomic_xchg64(mem, val) */
     __sync_synchronize();
     __sync_lock_test_and_set(mem, val);
--- apr.orig/configure.in
+++ apr/configure.in
@@ -552,31 +552,35 @@ AC_CACHE_CHECK([whether the compiler pro
 [AC_TRY_RUN([
 #if HAVE_STDINT_H
 #include <stdint.h>
+typedef uint64_t u64_t;
+#else
+typedef unsigned long long u64_t;
 #endif
 int main(int argc, const char *const *argv)
 {
-#if HAVE_STDINT_H
-    uint64_t val = 1010, tmp, *mem = &val;
-#else
-    unsigned long long val = 1010, tmp, *mem = &val;
-#endif
+    struct {
+        char pad0;
+        u64_t val;
+    } s;
+    u64_t *mem = &s.val, tmp;
 
-    if (__sync_fetch_and_add(&val, 1010) != 1010 || val != 2020)
+    s.val = 1010;
+    if (__sync_fetch_and_add(&s.val, 1010) != 1010 || s.val != 2020)
         return 1;
 
-    tmp = val;
-    if (__sync_fetch_and_sub(mem, 1010) != tmp || val != 1010)
+    tmp = s.val;
+    if (__sync_fetch_and_sub(mem, 1010) != tmp || s.val != 1010)
         return 1;
 
-    if (__sync_sub_and_fetch(&val, 1010) != 0 || val != 0)
+    if (__sync_sub_and_fetch(&s.val, 1010) != 0 || s.val != 0)
         return 1;
 
     tmp = 3030;
-    if (__sync_val_compare_and_swap(mem, 0, tmp) != 0 || val != tmp)
+    if (__sync_val_compare_and_swap(mem, 0, tmp) != 0 || s.val != tmp)
         return 1;
 
     __sync_synchronize();
-    if (__sync_lock_test_and_set(&val, 4040) != 3030)
+    if (__sync_lock_test_and_set(&s.val, 4040) != 3030)
         return 1;
 
     return 0;
@@ -586,31 +590,45 @@ AC_CACHE_CHECK([whether the compiler pro
 [AC_TRY_RUN([
 #if HAVE_STDINT_H
 #include <stdint.h>
+typedef uint64_t u64_t;
+#else
+typedef unsigned long long u64_t;
 #endif
+static int test_always_lock_free(volatile u64_t *val)
+{
+    return __atomic_always_lock_free(sizeof(*val), val);
+}
 int main(int argc, const char *const *argv)
 {
-#if HAVE_STDINT_H
-    uint64_t val = 1010, tmp, *mem = &val;
-#else
-    unsigned long long val = 1010, tmp, *mem = &val;
-#endif
+    struct {
+        char pad0;
+        u64_t val;
+        char pad1;
+        u64_t tmp;
+    } s;
+    u64_t *mem = &s.val;
+
+    /* check if alignment matters (no fallback to libatomic) */
+    if (!test_always_lock_free(&s.val))
+        return 1;
 
-    if (__atomic_fetch_add(&val, 1010, __ATOMIC_SEQ_CST) != 1010 || val != 2020)
+    s.val = 1010;
+    if (__atomic_fetch_add(&s.val, 1010, __ATOMIC_SEQ_CST) != 1010 || s.val != 2020)
         return 1;
 
-    tmp = val;
-    if (__atomic_fetch_sub(mem, 1010, __ATOMIC_SEQ_CST) != tmp || val != 1010)
+    s.tmp = s.val;
+    if (__atomic_fetch_sub(mem, 1010, __ATOMIC_SEQ_CST) != s.tmp || s.val != 1010)
         return 1;
 
-    if (__atomic_sub_fetch(&val, 1010, __ATOMIC_SEQ_CST) != 0 || val != 0)
+    if (__atomic_sub_fetch(&s.val, 1010, __ATOMIC_SEQ_CST) != 0 || s.val != 0)
         return 1;
 
-    tmp = val;
-    if (!__atomic_compare_exchange_n(mem, &tmp, 3030, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-            || tmp != 0)
+    s.tmp = s.val;
+    if (!__atomic_compare_exchange_n(mem, &s.tmp, 3030, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+            || s.tmp != 0)
         return 1;
 
-    if (__atomic_exchange_n(&val, 4040, __ATOMIC_SEQ_CST) != 3030)
+    if (__atomic_exchange_n(&s.val, 4040, __ATOMIC_SEQ_CST) != 3030)
         return 1;
 
     return 0;
--- apr.orig/test/testatomic.c
+++ apr/test/testatomic.c
@@ -662,6 +662,9 @@ static void test_atomics_threaded64(abts
     pthread_setconcurrency(8);
 #endif
 
+    mutex_locks64 = 0;
+    apr_atomic_set64(&atomic_ops64, 0);
+
     rv = apr_thread_mutex_create(&thread_lock64, APR_THREAD_MUTEX_DEFAULT, p);
     APR_ASSERT_SUCCESS(tc, "Could not create lock", rv);
 
