diff --git a/include/cutils/atomic-inline.h b/include/cutils/atomic-inline.h
new file mode 100644
index 000000000..4f5ddf761
--- /dev/null
+++ b/include/cutils/atomic-inline.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_CUTILS_ATOMIC_INLINE_H
+#define ANDROID_CUTILS_ATOMIC_INLINE_H
+
+/*
+ * Inline declarations and macros for some special-purpose atomic
+ * operations.  These are intended for rare circumstances where a
+ * memory barrier needs to be issued inline rather than as a function
+ * call.
+ *
+ * Most code should not use these.
+ *
+ * Anything that does include this file must set ANDROID_SMP to either
+ * 0 or 1, indicating compilation for UP or SMP, respectively.
+ */
+
+#if !defined(ANDROID_SMP)
+# error "Must define ANDROID_SMP before including atomic-inline.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define the full memory barrier for an SMP system.  This is
+ * platform-specific.
+ */
+
+#ifdef __arm__
+#include <machine/cpu-features.h>
+
+/*
+ * For ARMv6K we need to issue a specific MCR instead of the DMB, since
+ * that wasn't added until v7.  For anything older, SMP isn't relevant.
+ * Since we don't have an ARMv6K to test with, we're not going to deal
+ * with that now.
+ *
+ * The DMB instruction is found in the ARM and Thumb2 instruction sets.
+ * This will fail on plain 16-bit Thumb.
+ */
+#if defined(__ARM_HAVE_DMB)
+# define __android_membar_full_smp() \
+    do { __asm__ __volatile__ ("dmb" ::: "memory"); } while (0)
+#else
+# define __android_membar_full_smp() ARM_SMP_defined_but_no_DMB()
+#endif
+
+#elif defined(__i386__) || defined(__x86_64__)
+/*
+ * For recent x86, we can use the SSE2 mfence instruction.
+ */
+# define __android_membar_full_smp() \
+    do { __asm__ __volatile__ ("mfence" ::: "memory"); } while (0)
+
+#else
+/*
+ * Implementation not defined for this platform.  Hopefully we're building
+ * in uniprocessor mode.
+ */
+# define __android_membar_full_smp() SMP_barrier_not_defined_for_platform()
+#endif
+
+
+/*
+ * Full barrier.  On uniprocessors this is just a compiler reorder barrier,
+ * which ensures that the statements appearing above the barrier in the C/C++
+ * code will be issued before the statements appearing below the barrier.
+ *
+ * For SMP this also includes a memory barrier instruction.  On an ARM
+ * CPU this means that the current core will flush pending writes, wait
+ * for pending reads to complete, and discard any cached reads that could
+ * be stale.  Other CPUs may do less, but the end result is equivalent.
+ */
+#if ANDROID_SMP != 0
+# define android_membar_full() __android_membar_full_smp()
+#else
+# define android_membar_full() \
+    do { __asm__ __volatile__ ("" ::: "memory"); } while (0)
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // ANDROID_CUTILS_ATOMIC_INLINE_H
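A minimal sketch of the rare inline use this header describes: hand-publishing a value to another core with an explicit full barrier on both sides. The names payload/data_ready/publish/try_consume are made up for illustration, and ANDROID_SMP would normally come from the build (see the -DANDROID_SMP flags added to libcutils/Android.mk below) rather than being defined by hand.

#include <stdint.h>
#define ANDROID_SMP 1                 /* illustration only; normally -DANDROID_SMP=0/1 from the build */
#include <cutils/atomic-inline.h>

static int32_t payload;               /* hypothetical data handed to another thread */
static volatile int32_t data_ready;   /* hypothetical "data is valid" flag */

void publish(int32_t value)
{
    payload = value;
    android_membar_full();    /* make the payload visible before the flag */
    data_ready = 1;
}

int try_consume(int32_t* out)
{
    if (data_ready == 0)
        return 0;
    android_membar_full();    /* keep the payload read from floating above the flag read */
    *out = payload;
    return 1;
}

On a uniprocessor build the two barriers collapse to compiler reorder barriers, which is all that is needed there.
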
diff --git a/include/cutils/atomic.h b/include/cutils/atomic.h
index 5694d66ac..8e12902b0 100644
--- a/include/cutils/atomic.h
+++ b/include/cutils/atomic.h
@@ -25,10 +25,8 @@ extern "C" {
 #endif

 /*
- * NOTE: memory shared between threads is synchronized by all atomic operations
- * below, this means that no explicit memory barrier is required: all reads or
- * writes issued before android_atomic_* operations are guaranteed to complete
- * before the atomic operation takes place.
+ * Unless otherwise noted, the operations below perform a full fence before
+ * the atomic operation on SMP systems ("release" semantics).
  */

 void android_atomic_write(int32_t value, volatile int32_t* addr);
@@ -37,7 +35,6 @@ void android_atomic_write(int32_t value, volatile int32_t* addr);
  * all these atomic operations return the previous value
  */

-
 int32_t android_atomic_inc(volatile int32_t* addr);
 int32_t android_atomic_dec(volatile int32_t* addr);

@@ -48,30 +45,32 @@ int32_t android_atomic_or(int32_t value, volatile int32_t* addr);
 int32_t android_atomic_swap(int32_t value, volatile int32_t* addr);

 /*
- * NOTE: Two "quasiatomic" operations on the exact same memory address
- * are guaranteed to operate atomically with respect to each other,
- * but no guarantees are made about quasiatomic operations mixed with
- * non-quasiatomic operations on the same address, nor about
- * quasiatomic operations that are performed on partially-overlapping
- * memory.
+ * cmpxchg returns zero if the new value was successfully written.  This
+ * will only happen when *addr == oldvalue.
+ *
+ * (The return value is inverted from implementations on other platforms, but
+ * matches the ARM ldrex/strex semantics.  Note also this is a compare-and-set
+ * operation, not a compare-and-exchange operation, since we don't return
+ * the original value.)
  */
-
-int64_t android_quasiatomic_swap_64(int64_t value, volatile int64_t* addr);
-int64_t android_quasiatomic_read_64(volatile int64_t* addr);
-
-/*
- * cmpxchg return a non zero value if the exchange was NOT performed,
- * in other words if oldvalue != *addr
- */
-
 int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue,
         volatile int32_t* addr);
-int android_quasiatomic_cmpxchg_64(int64_t oldvalue, int64_t newvalue,
-        volatile int64_t* addr);

+/*
+ * Same basic operation as android_atomic_cmpxchg, but with "acquire"
+ * semantics.  The memory barrier, if required, is performed after the
+ * new value is stored.  Useful for acquiring a spin lock.
+ */
+int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr);
+/*
+ * Perform an atomic store with "release" semantics.  The memory barrier,
+ * if required, is performed before the store instruction.  Useful for
+ * releasing a spin lock.
+ */
+#define android_atomic_release_store android_atomic_write

-
 #ifdef __cplusplus
 } // extern "C"
 #endif
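The two comments above describe exactly the spin-lock pattern these entry points are meant for. A minimal sketch, assuming a lock word that starts at 0 (the names my_lock/my_spin_lock/my_spin_unlock are illustrative, not part of the header):

#include <stdint.h>
#include <cutils/atomic.h>

static volatile int32_t my_lock = 0;   /* 0 = free, 1 = held */

void my_spin_lock(void)
{
    /* 0 -> 1 transition; returns 0 on success, with the barrier after the store */
    while (android_atomic_acquire_cmpxchg(0, 1, &my_lock) != 0) {
        /* spin; a real implementation would back off or yield here */
    }
}

void my_spin_unlock(void)
{
    /* barrier before the store, then drop the lock word back to 0 */
    android_atomic_release_store(0, &my_lock);
}

The acquire barrier keeps the critical section from starting before the lock is observed as taken, and the release barrier keeps it from leaking past the unlocking store.
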
diff --git a/libcutils/Android.mk b/libcutils/Android.mk
index 4c45cc92e..5b05a1e51 100644
--- a/libcutils/Android.mk
+++ b/libcutils/Android.mk
@@ -16,6 +16,13 @@ LOCAL_PATH := $(my-dir)

 include $(CLEAR_VARS)

+ifeq ($(TARGET_CPU_SMP),true)
+    targetSmpFlag := -DANDROID_SMP=1
+else
+    targetSmpFlag := -DANDROID_SMP=0
+endif
+hostSmpFlag := -DANDROID_SMP=0
+
 commonSources := \
 	array.c \
 	hashmap.c \
@@ -80,6 +87,7 @@ LOCAL_MODULE := libcutils
 LOCAL_SRC_FILES := $(commonSources) $(commonHostSources)
 LOCAL_LDLIBS := -lpthread
 LOCAL_STATIC_LIBRARIES := liblog
+LOCAL_CFLAGS += $(hostSmpFlag)

 include $(BUILD_HOST_STATIC_LIBRARY)

@@ -92,6 +100,7 @@ LOCAL_MODULE := libcutils
 LOCAL_SRC_FILES := $(commonSources) $(commonHostSources) memory.c dlmalloc_stubs.c
 LOCAL_LDLIBS := -lpthread
 LOCAL_SHARED_LIBRARIES := liblog
+LOCAL_CFLAGS += $(targetSmpFlag)

 include $(BUILD_SHARED_LIBRARY)
 else #!sim
@@ -114,12 +123,14 @@ endif # !arm
 LOCAL_C_INCLUDES := $(KERNEL_HEADERS)
 LOCAL_STATIC_LIBRARIES := liblog
+LOCAL_CFLAGS += $(targetSmpFlag)

 include $(BUILD_STATIC_LIBRARY)

 include $(CLEAR_VARS)
 LOCAL_MODULE := libcutils
 LOCAL_WHOLE_STATIC_LIBRARIES := libcutils
 LOCAL_SHARED_LIBRARIES := liblog
+LOCAL_CFLAGS += $(targetSmpFlag)

 include $(BUILD_SHARED_LIBRARY)
 endif #!sim
diff --git a/libcutils/atomic-android-arm.S b/libcutils/atomic-android-arm.S
index 1dd2363d6..f918990c8 100644
--- a/libcutils/atomic-android-arm.S
+++ b/libcutils/atomic-android-arm.S
@@ -14,6 +14,8 @@
  * limitations under the License.
  */

+/* TODO: insert memory barriers on SMP */
+
 #include <machine/cpu-features.h>

 /*
@@ -43,6 +45,8 @@
     .global android_atomic_cmpxchg
     .type android_atomic_cmpxchg, %function
+    .global android_atomic_acquire_cmpxchg
+    .type android_atomic_acquire_cmpxchg, %function

 /*
  * ----------------------------------------------------------------------------
@@ -237,7 +241,7 @@ android_atomic_or:
 /* replaced swp instruction with ldrex/strex for ARMv6 & ARMv7 */

 android_atomic_swap:
-#if defined (_ARM_HAVE_LDREX_STREX)
+#if defined (__ARM_HAVE_LDREX_STREX)
 1:  ldrex r2, [r1]
     strex r3, r0, [r1]
     teq r3, #0
@@ -256,6 +260,7 @@ android_atomic_swap:
  * output: r0 = 0 (xchg done) or non-zero (xchg not done)
  */

+android_atomic_acquire_cmpxchg:
 android_atomic_cmpxchg:
     .fnstart
     .save {r4, lr}
@@ -282,10 +287,3 @@ android_atomic_cmpxchg:
     bx lr
     .fnend

-/*
- * ----------------------------------------------------------------------------
- * android_atomic_cmpxchg_64
- * input: r0-r1=oldvalue, r2-r3=newvalue, arg4 (on stack)=address
- * output: r0 = 0 (xchg done) or non-zero (xchg not done)
- */
-/* TODO: NEED IMPLEMENTATION FOR THIS ARCHITECTURE */
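The TODO above means that, for now, android_atomic_acquire_cmpxchg on ARM is just a second label on android_atomic_cmpxchg with no barrier. A rough C-level sketch of what the acquire variant is expected to do on SMP once the barriers are inserted, mirroring the x86 and OS X variants in libcutils/atomic.c below (illustration only, not the shipped assembly; assumes the translation unit is built with -DANDROID_SMP set):

#include <stdint.h>
#include <cutils/atomic.h>
#include <cutils/atomic-inline.h>

/* Illustration only: "acquire" places the fence after a successful
 * compare-and-set so that later loads/stores cannot move above it. */
int acquire_cmpxchg_sketch(int32_t oldvalue, int32_t newvalue,
        volatile int32_t* addr)
{
    int failed = android_atomic_cmpxchg(oldvalue, newvalue, addr);
    if (!failed)
        android_membar_full();   /* e.g. "dmb" on an SMP ARMv7 build */
    return failed;
}
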
diff --git a/libcutils/atomic-android-sh.c b/libcutils/atomic-android-sh.c
index acbea976d..d95b02bdb 100644
--- a/libcutils/atomic-android-sh.c
+++ b/libcutils/atomic-android-sh.c
@@ -118,42 +118,8 @@ int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue,
     return result;
 }

-int64_t android_quasiatomic_swap_64(int64_t value, volatile int64_t* addr) {
-    int64_t oldValue;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-
-    oldValue = *addr;
-    *addr = value;
-
-    pthread_mutex_unlock(lock);
-    return oldValue;
+int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr) {
+    return android_atomic_cmpxchg(oldvalue, newvalue, addr);
 }

-int android_quasiatomic_cmpxchg_64(int64_t oldvalue, int64_t newvalue,
-        volatile int64_t* addr) {
-    int result;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-
-    if (*addr == oldvalue) {
-        *addr = newvalue;
-        result = 0;
-    } else {
-        result = 1;
-    }
-    pthread_mutex_unlock(lock);
-    return result;
-}
-
-int64_t android_quasiatomic_read_64(volatile int64_t* addr) {
-    int64_t result;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-    result = *addr;
-    pthread_mutex_unlock(lock);
-    return result;
-}
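Because android_atomic_cmpxchg returns zero only when the new value was actually written (the inverted convention documented in atomic.h), read-modify-write helpers are written as retry loops around it. A small sketch with an illustrative function name, assuming <cutils/atomic.h>:

#include <stdint.h>
#include <cutils/atomic.h>

/* Atomically add 'amount' to *addr using compare-and-set;
 * returns the value observed immediately before the add. */
int32_t add_via_cmpxchg(int32_t amount, volatile int32_t* addr)
{
    int32_t old;
    do {
        old = *addr;
        /* non-zero return: another thread raced us, so reload and retry */
    } while (android_atomic_cmpxchg(old, old + amount, addr) != 0);
    return old;
}
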
diff --git a/libcutils/atomic.c b/libcutils/atomic.c
index 41faaa282..d81890614 100644
--- a/libcutils/atomic.c
+++ b/libcutils/atomic.c
@@ -15,6 +15,7 @@
  */

 #include <cutils/atomic.h>
+#include <cutils/atomic-inline.h>
 #ifdef HAVE_WIN32_THREADS
 #include <windows.h>
 #else
@@ -70,40 +71,19 @@ int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) {
 }

 int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) {
+    /* OS X CAS returns zero on failure; invert to return zero on success */
     return OSAtomicCompareAndSwap32Barrier(oldvalue, newvalue, (int32_t*)addr) == 0;
 }

-#if defined(__ppc__) \
-        || defined(__PPC__) \
-        || defined(__powerpc__) \
-        || defined(__powerpc) \
-        || defined(__POWERPC__) \
-        || defined(_M_PPC) \
-        || defined(__PPC)
-#define NEED_QUASIATOMICS 1
-#else
-
-int android_quasiatomic_cmpxchg_64(int64_t oldvalue, int64_t newvalue,
-        volatile int64_t* addr) {
-    return OSAtomicCompareAndSwap64Barrier(oldvalue, newvalue,
-            (int64_t*)addr) == 0;
+int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr) {
+    int result = (OSAtomicCompareAndSwap32(oldvalue, newvalue, (int32_t*)addr) == 0);
+    if (!result) {
+        /* success, perform barrier */
+        OSMemoryBarrier();
+    }
+    return result;
 }

-int64_t android_quasiatomic_swap_64(int64_t value, volatile int64_t* addr) {
-    int64_t oldValue;
-    do {
-        oldValue = *addr;
-    } while (android_quasiatomic_cmpxchg_64(oldValue, value, addr));
-    return oldValue;
-}
-
-int64_t android_quasiatomic_read_64(volatile int64_t* addr) {
-    return OSAtomicAdd64Barrier(0, addr);
-}
-
-#endif
-
-
 /*****************************************************************************/
 #elif defined(__i386__) || defined(__x86_64__)
@@ -163,6 +143,7 @@ int32_t android_atomic_swap(int32_t value, volatile int32_t* addr) {
 }

 int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t* addr) {
+    android_membar_full();
     int xchg;
     asm volatile
     (
@@ -175,75 +156,25 @@ int android_atomic_cmpxchg(int32_t oldvalue, int32_t newvalue, volatile int32_t*
     return xchg;
 }

-#define NEED_QUASIATOMICS 1
+int android_atomic_acquire_cmpxchg(int32_t oldvalue, int32_t newvalue,
+        volatile int32_t* addr) {
+    int xchg;
+    asm volatile
+    (
+    "    lock; cmpxchg %%ecx, (%%edx);"
+    "    setne %%al;"
+    "    andl $1, %%eax"
+    : "=a" (xchg)
+    : "a" (oldvalue), "c" (newvalue), "d" (addr)
+    );
+    android_membar_full();
+    return xchg;
+}
+

 /*****************************************************************************/
 #elif __arm__

-// Most of the implementation is in atomic-android-arm.s.
-
-// on the device, we implement the 64-bit atomic operations through
-// mutex locking. normally, this is bad because we must initialize
-// a pthread_mutex_t before being able to use it, and this means
-// having to do an initialization check on each function call, and
-// that's where really ugly things begin...
-//
-// BUT, as a special twist, we take advantage of the fact that in our
-// pthread library, a mutex is simply a volatile word whose value is always
-// initialized to 0. In other words, simply declaring a static mutex
-// object initializes it !
-//
-// another twist is that we use a small array of mutexes to dispatch
-// the contention locks from different memory addresses
-//
-
-#include <pthread.h>
-
-#define SWAP_LOCK_COUNT  32U
-static pthread_mutex_t  _swap_locks[SWAP_LOCK_COUNT];
-
-#define SWAP_LOCK(addr) \
-    &_swap_locks[((unsigned)(void*)(addr) >> 3U) % SWAP_LOCK_COUNT]
-
-
-int64_t android_quasiatomic_swap_64(int64_t value, volatile int64_t* addr) {
-    int64_t oldValue;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-
-    oldValue = *addr;
-    *addr = value;
-
-    pthread_mutex_unlock(lock);
-    return oldValue;
-}
-
-int android_quasiatomic_cmpxchg_64(int64_t oldvalue, int64_t newvalue,
-        volatile int64_t* addr) {
-    int result;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-
-    if (*addr == oldvalue) {
-        *addr = newvalue;
-        result = 0;
-    } else {
-        result = 1;
-    }
-    pthread_mutex_unlock(lock);
-    return result;
-}
-
-int64_t android_quasiatomic_read_64(volatile int64_t* addr) {
-    int64_t result;
-    pthread_mutex_t* lock = SWAP_LOCK(addr);
-
-    pthread_mutex_lock(lock);
-    result = *addr;
-    pthread_mutex_unlock(lock);
-    return result;
-}
+// implementation for ARM is in atomic-android-arm.s.

 /*****************************************************************************/
 #elif __sh__
@@ -255,85 +186,3 @@ int64_t android_quasiatomic_read_64(volatile int64_t* addr) {

 #endif

-
-
-#if NEED_QUASIATOMICS
-
-/* Note that a spinlock is *not* a good idea in general
- * since they can introduce subtle issues. For example,
- * a real-time thread trying to acquire a spinlock already
- * acquired by another thread will never yeld, making the
- * CPU loop endlessly!
- *
- * However, this code is only used on the Linux simulator
- * so it's probably ok for us.
- *
- * The alternative is to use a pthread mutex, but
- * these must be initialized before being used, and
- * then you have the problem of lazily initializing
- * a mutex without any other synchronization primitive.
- */
-
-/* global spinlock for all 64-bit quasiatomic operations */
-static int32_t quasiatomic_spinlock = 0;
-
-int android_quasiatomic_cmpxchg_64(int64_t oldvalue, int64_t newvalue,
-        volatile int64_t* addr) {
-    int result;
-
-    while (android_atomic_cmpxchg(0, 1, &quasiatomic_spinlock)) {
-#ifdef HAVE_WIN32_THREADS
-        Sleep(0);
-#else
-        sched_yield();
-#endif
-    }
-
-    if (*addr == oldvalue) {
-        *addr = newvalue;
-        result = 0;
-    } else {
-        result = 1;
-    }
-
-    android_atomic_swap(0, &quasiatomic_spinlock);
-
-    return result;
-}
-
-int64_t android_quasiatomic_read_64(volatile int64_t* addr) {
-    int64_t result;
-
-    while (android_atomic_cmpxchg(0, 1, &quasiatomic_spinlock)) {
-#ifdef HAVE_WIN32_THREADS
-        Sleep(0);
-#else
-        sched_yield();
-#endif
-    }
-
-    result = *addr;
-    android_atomic_swap(0, &quasiatomic_spinlock);
-
-    return result;
-}
-
-int64_t android_quasiatomic_swap_64(int64_t value, volatile int64_t* addr) {
-    int64_t result;
-
-    while (android_atomic_cmpxchg(0, 1, &quasiatomic_spinlock)) {
-#ifdef HAVE_WIN32_THREADS
-        Sleep(0);
-#else
-        sched_yield();
-#endif
-    }
-
-    result = *addr;
-    *addr = value;
-    android_atomic_swap(0, &quasiatomic_spinlock);
-
-    return result;
-}
-
-#endif
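One closing note on the alias added to atomic.h: android_atomic_release_store works because android_atomic_write issues its fence before the store, as the new header comment states. Roughly what that amounts to on an SMP build (illustration only; the real store is the per-architecture code, and this sketch assumes <cutils/atomic-inline.h> is usable, i.e. ANDROID_SMP is set by the build):

#include <stdint.h>
#include <cutils/atomic-inline.h>

static inline void release_store_sketch(int32_t value, volatile int32_t* addr)
{
    android_membar_full();   /* earlier writes become visible first... */
    *addr = value;           /* ...then the lock word or flag itself is stored */
}

On a uniprocessor build (ANDROID_SMP=0) the barrier degenerates to a compiler reorder barrier, so this reduces to a plain store that the compiler may not hoist earlier stores past.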