diff --git a/amdgpu/amdgpu.h b/amdgpu/amdgpu.h
index e44d802b..08513061 100644
--- a/amdgpu/amdgpu.h
+++ b/amdgpu/amdgpu.h
@@ -124,6 +124,11 @@ typedef struct amdgpu_bo_list *amdgpu_bo_list_handle;
  */
 typedef struct amdgpu_va *amdgpu_va_handle;
 
+/**
+ * Define handle for semaphore
+ */
+typedef struct amdgpu_semaphore *amdgpu_semaphore_handle;
+
 /*--------------------------------------------------------------------------*/
 /* -------------------------- Structures ---------------------------------- */
 /*--------------------------------------------------------------------------*/
@@ -1180,4 +1185,64 @@ int amdgpu_bo_va_op(amdgpu_bo_handle bo,
 		    uint64_t flags,
 		    uint32_t ops);
 
+/**
+ *  create semaphore
+ *
+ * \param   sem	   - \c [out] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_create_semaphore(amdgpu_semaphore_handle *sem);
+
+/**
+ *  signal semaphore
+ *
+ * \param   context        - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem	           - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_signal_semaphore(amdgpu_context_handle ctx,
+			       uint32_t ip_type,
+			       uint32_t ip_instance,
+			       uint32_t ring,
+			       amdgpu_semaphore_handle sem);
+
+/**
+ *  wait semaphore
+ *
+ * \param   context        - \c [in] GPU Context
+ * \param   ip_type        - \c [in] Hardware IP block type = AMDGPU_HW_IP_*
+ * \param   ip_instance    - \c [in] Index of the IP block of the same type
+ * \param   ring           - \c [in] Specify ring index of the IP
+ * \param   sem	           - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_wait_semaphore(amdgpu_context_handle ctx,
+			     uint32_t ip_type,
+			     uint32_t ip_instance,
+			     uint32_t ring,
+			     amdgpu_semaphore_handle sem);
+
+/**
+ *  destroy semaphore
+ *
+ * \param   sem	    - \c [in] semaphore handle
+ *
+ * \return   0 on success\n
+ *          <0 - Negative POSIX Error code
+ *
+*/
+int amdgpu_cs_destroy_semaphore(amdgpu_semaphore_handle sem);
+
 #endif /* #ifdef _AMDGPU_H_ */
diff --git a/amdgpu/amdgpu_cs.c b/amdgpu/amdgpu_cs.c
index 6747158c..1848ade5 100644
--- a/amdgpu/amdgpu_cs.c
+++ b/amdgpu/amdgpu_cs.c
@@ -40,6 +40,9 @@
 #include "amdgpu_drm.h"
 #include "amdgpu_internal.h"
 
+static int amdgpu_cs_unreference_sem(amdgpu_semaphore_handle sem);
+static int amdgpu_cs_reset_sem(amdgpu_semaphore_handle sem);
+
 /**
  * Create command submission context
  *
@@ -53,6 +56,7 @@ int amdgpu_cs_ctx_create(amdgpu_device_handle dev,
 {
 	struct amdgpu_context *gpu_context;
 	union drm_amdgpu_ctx args;
+	int i, j, k;
 	int r;
 
 	if (NULL == dev)
@@ -66,6 +70,10 @@ int amdgpu_cs_ctx_create(amdgpu_device_handle dev,
 
 	gpu_context->dev = dev;
 
+	r = pthread_mutex_init(&gpu_context->sequence_mutex, NULL);
+	if (r)
+		goto error;
+
 	/* Create the context */
 	memset(&args, 0, sizeof(args));
 	args.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
@@ -74,11 +82,16 @@ int amdgpu_cs_ctx_create(amdgpu_device_handle dev,
 		goto error;
 
 	gpu_context->id = args.out.alloc.ctx_id;
+	for (i = 0; i < AMDGPU_HW_IP_NUM; i++)
+		for (j = 0; j < AMDGPU_HW_IP_INSTANCE_MAX_COUNT; j++)
+			for (k = 0; k < AMDGPU_CS_MAX_RINGS; k++)
+				list_inithead(&gpu_context->sem_list[i][j][k]);
 	*context = (amdgpu_context_handle)gpu_context;
 
 	return 0;
 
 error:
+	pthread_mutex_destroy(&gpu_context->sequence_mutex);
 	free(gpu_context);
 	return r;
 }
@@ -94,18 +107,32 @@ error:
 int amdgpu_cs_ctx_free(amdgpu_context_handle context)
 {
 	union drm_amdgpu_ctx args;
+	int i, j, k;
 	int r;
 
 	if (NULL == context)
 		return -EINVAL;
 
+	pthread_mutex_destroy(&context->sequence_mutex);
+
 	/* now deal with kernel side */
 	memset(&args, 0, sizeof(args));
 	args.in.op = AMDGPU_CTX_OP_FREE_CTX;
 	args.in.ctx_id = context->id;
 	r = drmCommandWriteRead(context->dev->fd, DRM_AMDGPU_CTX,
 				&args, sizeof(args));
-
+	for (i = 0; i < AMDGPU_HW_IP_NUM; i++) {
+		for (j = 0; j < AMDGPU_HW_IP_INSTANCE_MAX_COUNT; j++) {
+			for (k = 0; k < AMDGPU_CS_MAX_RINGS; k++) {
+				amdgpu_semaphore_handle sem;
+				LIST_FOR_EACH_ENTRY(sem, &context->sem_list[i][j][k], list) {
+					list_del(&sem->list);
+					amdgpu_cs_reset_sem(sem);
+					amdgpu_cs_unreference_sem(sem);
+				}
+			}
+		}
+	}
 	free(context);
 
 	return r;
@@ -150,7 +177,10 @@ static int amdgpu_cs_submit_one(amdgpu_context_handle context,
 	struct drm_amdgpu_cs_chunk *chunks;
 	struct drm_amdgpu_cs_chunk_data *chunk_data;
 	struct drm_amdgpu_cs_chunk_dep *dependencies = NULL;
-	uint32_t i, size;
+	struct drm_amdgpu_cs_chunk_dep *sem_dependencies = NULL;
+	struct list_head *sem_list;
+	amdgpu_semaphore_handle sem;
+	uint32_t i, size, sem_count = 0;
 	bool user_fence;
 	int r = 0;
 
@@ -162,7 +192,7 @@ static int amdgpu_cs_submit_one(amdgpu_context_handle context,
 		return -EINVAL;
 	user_fence = (ibs_request->fence_info.handle != NULL);
 
-	size = ibs_request->number_of_ibs + (user_fence ? 2 : 1);
+	size = ibs_request->number_of_ibs + (user_fence ? 2 : 1) + 1;
 
 	chunk_array = alloca(sizeof(uint64_t) * size);
 	chunks = alloca(sizeof(struct drm_amdgpu_cs_chunk) * size);
@@ -196,6 +226,8 @@ static int amdgpu_cs_submit_one(amdgpu_context_handle context,
 		chunk_data[i].ib_data.flags = ib->flags;
 	}
 
+	pthread_mutex_lock(&context->sequence_mutex);
+
 	if (user_fence) {
 		i = cs.in.num_chunks++;
 
@@ -240,15 +272,49 @@ static int amdgpu_cs_submit_one(amdgpu_context_handle context,
 		chunks[i].chunk_data = (uint64_t)(uintptr_t)dependencies;
 	}
 
+	sem_list = &context->sem_list[ibs_request->ip_type][ibs_request->ip_instance][ibs_request->ring];
+	LIST_FOR_EACH_ENTRY(sem, sem_list, list)
+		sem_count++;
+	if (sem_count) {
+		sem_dependencies = malloc(sizeof(struct drm_amdgpu_cs_chunk_dep) * sem_count);
+		if (!sem_dependencies) {
+			r = -ENOMEM;
+			goto error_unlock;
+		}
+		sem_count = 0;
+		LIST_FOR_EACH_ENTRY(sem, sem_list, list) {
+			struct amdgpu_cs_fence *info = &sem->signal_fence;
+			struct drm_amdgpu_cs_chunk_dep *dep = &sem_dependencies[sem_count++];
+			dep->ip_type = info->ip_type;
+			dep->ip_instance = info->ip_instance;
+			dep->ring = info->ring;
+			dep->ctx_id = info->context->id;
+			dep->handle = info->fence;
+
+			list_del(&sem->list);
+			amdgpu_cs_reset_sem(sem);
+			amdgpu_cs_unreference_sem(sem);
+		}
+		i = cs.in.num_chunks++;
+
+		/* dependencies chunk */
+		chunk_array[i] = (uint64_t)(uintptr_t)&chunks[i];
+		chunks[i].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
+		chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_dep) / 4 * sem_count;
+		chunks[i].chunk_data = (uint64_t)(uintptr_t)sem_dependencies;
+	}
+
 	r = drmCommandWriteRead(context->dev->fd, DRM_AMDGPU_CS,
 				&cs, sizeof(cs));
 	if (r)
 		goto error_unlock;
 
 	ibs_request->seq_no = cs.out.handle;
-
+	context->last_seq[ibs_request->ip_type][ibs_request->ip_instance][ibs_request->ring] = ibs_request->seq_no;
 error_unlock:
+	pthread_mutex_unlock(&context->sequence_mutex);
 	free(dependencies);
+	free(sem_dependencies);
 	return r;
 }
 
@@ -369,3 +435,102 @@ int amdgpu_cs_query_fence_status(struct amdgpu_cs_fence *fence,
 	return r;
 }
 
+int amdgpu_cs_create_semaphore(amdgpu_semaphore_handle *sem)
+{
+	struct amdgpu_semaphore *gpu_semaphore;
+
+	if (NULL == sem)
+		return -EINVAL;
+
+	gpu_semaphore = calloc(1, sizeof(struct amdgpu_semaphore));
+	if (NULL == gpu_semaphore)
+		return -ENOMEM;
+
+	atomic_set(&gpu_semaphore->refcount, 1);
+	*sem = gpu_semaphore;
+
+	return 0;
+}
+
+int amdgpu_cs_signal_semaphore(amdgpu_context_handle ctx,
+			       uint32_t ip_type,
+			       uint32_t ip_instance,
+			       uint32_t ring,
+			       amdgpu_semaphore_handle sem)
+{
+	if (NULL == ctx)
+		return -EINVAL;
+	if (ip_type >= AMDGPU_HW_IP_NUM)
+		return -EINVAL;
+	if (ring >= AMDGPU_CS_MAX_RINGS)
+		return -EINVAL;
+	if (NULL == sem)
+		return -EINVAL;
+	/* sem has been signaled */
+	if (sem->signal_fence.context)
+		return -EINVAL;
+	pthread_mutex_lock(&ctx->sequence_mutex);
+	sem->signal_fence.context = ctx;
+	sem->signal_fence.ip_type = ip_type;
+	sem->signal_fence.ip_instance = ip_instance;
+	sem->signal_fence.ring = ring;
+	sem->signal_fence.fence = ctx->last_seq[ip_type][ip_instance][ring];
+	update_references(NULL, &sem->refcount);
+	pthread_mutex_unlock(&ctx->sequence_mutex);
+	return 0;
+}
+
+int amdgpu_cs_wait_semaphore(amdgpu_context_handle ctx,
+			     uint32_t ip_type,
+			     uint32_t ip_instance,
+			     uint32_t ring,
+			     amdgpu_semaphore_handle sem)
+{
+	if (NULL == ctx)
+		return -EINVAL;
+	if (ip_type >= AMDGPU_HW_IP_NUM)
+		return -EINVAL;
+	if (ring >= AMDGPU_CS_MAX_RINGS)
+		return -EINVAL;
+	if (NULL == sem)
+		return -EINVAL;
+	/* must signal first */
+	if (NULL == sem->signal_fence.context)
+		return -EINVAL;
+
+	pthread_mutex_lock(&ctx->sequence_mutex);
+	list_add(&sem->list, &ctx->sem_list[ip_type][ip_instance][ring]);
+	pthread_mutex_unlock(&ctx->sequence_mutex);
+	return 0;
+}
+
+static int amdgpu_cs_reset_sem(amdgpu_semaphore_handle sem)
+{
+	if (NULL == sem)
+		return -EINVAL;
+	if (NULL == sem->signal_fence.context)
+		return -EINVAL;
+
+	sem->signal_fence.context = NULL;;
+	sem->signal_fence.ip_type = 0;
+	sem->signal_fence.ip_instance = 0;
+	sem->signal_fence.ring = 0;
+	sem->signal_fence.fence = 0;
+
+	return 0;
+}
+
+static int amdgpu_cs_unreference_sem(amdgpu_semaphore_handle sem)
+{
+	if (NULL == sem)
+		return -EINVAL;
+
+	if (update_references(&sem->refcount, NULL))
+		free(sem);
+	return 0;
+}
+
+int amdgpu_cs_destroy_semaphore(amdgpu_semaphore_handle sem)
+{
+	return amdgpu_cs_unreference_sem(sem);
+}
diff --git a/amdgpu/amdgpu_internal.h b/amdgpu/amdgpu_internal.h
index 7dd5c1c7..557ba1f1 100644
--- a/amdgpu/amdgpu_internal.h
+++ b/amdgpu/amdgpu_internal.h
@@ -111,8 +111,23 @@ struct amdgpu_bo_list {
 
 struct amdgpu_context {
 	struct amdgpu_device *dev;
+	/** Mutex for accessing fences and to maintain command submissions
+	    in good sequence. */
+	pthread_mutex_t sequence_mutex;
 	/* context id*/
 	uint32_t id;
+	uint64_t last_seq[AMDGPU_HW_IP_NUM][AMDGPU_HW_IP_INSTANCE_MAX_COUNT][AMDGPU_CS_MAX_RINGS];
+	struct list_head sem_list[AMDGPU_HW_IP_NUM][AMDGPU_HW_IP_INSTANCE_MAX_COUNT][AMDGPU_CS_MAX_RINGS];
+};
+
+/**
+ * Structure describing sw semaphore based on scheduler
+ *
+ */
+struct amdgpu_semaphore {
+	atomic_t refcount;
+	struct list_head list;
+	struct amdgpu_cs_fence signal_fence;
 };
 
 /**