freedreno: add bo cache
Workloads which create many transient buffers cause significant CPU overhead
in buffer allocation, zeroing, cache maint, and mmap setup. By caching and
re-using existing buffers, the CPU overhead drops significantly. See:

http://bloggingthemonkey.blogspot.com/2013/09/freedreno-update-moar-fps.html

A simple time-based policy is used for purging the cache. Once the kernel
supports it, we could use a madvise-style API to handle memory pressure
scenarios a bit better.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
parent: 1489811a80
commit: 068ea68b3f
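The win is easiest to see with a sketch of the pattern the cache targets
(illustrative only, not part of the patch; `transient_workload` and `drm_fd`
are hypothetical, with `drm_fd` assumed to be an already-opened DRM device fd):

	#include <freedreno_drmif.h>

	static void transient_workload(int drm_fd)
	{
		struct fd_device *dev = fd_device_new(drm_fd);
		int i;

		for (i = 0; i < 1000; i++) {
			/* a short-lived scratch buffer per iteration: without the
			 * cache each round is a fresh GEM allocation, zeroing and
			 * mmap setup; with it, iterations after the first should
			 * be served from the 16 KiB bucket:
			 */
			struct fd_bo *bo = fd_bo_new(dev, 0x4000,
					DRM_FREEDRENO_GEM_TYPE_KMEM);
			/* ... fill and submit ... */
			fd_bo_del(bo);
		}

		fd_device_del(dev);
	}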
freedreno/freedreno_bo.c:

@@ -31,6 +31,8 @@
 static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
 
+static void bo_del(struct fd_bo *bo);
+
 /* set buffer name, and add to table, call w/ table_lock held: */
 static void set_name(struct fd_bo *bo, uint32_t name)
 {
@@ -68,24 +70,128 @@ static struct fd_bo * bo_from_handle(struct fd_device *dev,
 	bo->size = size;
 	bo->handle = handle;
 	atomic_set(&bo->refcnt, 1);
+	list_inithead(&bo->list);
 	/* add ourself into the handle table: */
 	drmHashInsert(dev->handle_table, handle, bo);
 	return bo;
 }
 
+/* Frees older cached buffers.  Called under table_lock */
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time)
+{
+	int i;
+
+	if (dev->time == time)
+		return;
+
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		struct fd_bo *bo;
+
+		while (!LIST_IS_EMPTY(&bucket->list)) {
+			bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+
+			/* keep things in cache for at least 1 second: */
+			if (time && ((time - bo->free_time) <= 1))
+				break;
+
+			list_del(&bo->list);
+			bo_del(bo);
+		}
+	}
+
+	dev->time = time;
+}
+
+static struct fd_bo_bucket * get_bucket(struct fd_device *dev, uint32_t size)
+{
+	int i;
+
+	/* hmm, this is what intel does, but I suppose we could calculate our
+	 * way to the correct bucket size rather than looping..
+	 */
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		if (bucket->size >= size) {
+			return bucket;
+		}
+	}
+
+	return NULL;
+}
+
+static int is_idle(struct fd_bo *bo)
+{
+	return fd_bo_cpu_prep(bo, NULL,
+			DRM_FREEDRENO_PREP_READ |
+			DRM_FREEDRENO_PREP_WRITE |
+			DRM_FREEDRENO_PREP_NOSYNC) == 0;
+}
+
+static struct fd_bo *find_in_bucket(struct fd_device *dev,
+		struct fd_bo_bucket *bucket, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+
+	/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
+	 * skip the busy check.. if it is only going to be a render target
+	 * then we probably don't need to stall..
+	 *
+	 * NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
+	 * (MRU, since likely to be in GPU cache), rather than head (LRU)..
+	 */
+	pthread_mutex_lock(&table_lock);
+	while (!LIST_IS_EMPTY(&bucket->list)) {
+		bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+		if (0 /* TODO: if madvise tells us bo is gone... */) {
+			list_del(&bo->list);
+			bo_del(bo);
+			bo = NULL;
+			continue;
+		}
+		/* TODO check for compatible flags? */
+		if (is_idle(bo)) {
+			list_del(&bo->list);
+			break;
+		}
+		bo = NULL;
+		break;
+	}
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+
 struct fd_bo * fd_bo_new(struct fd_device *dev,
 		uint32_t size, uint32_t flags)
 {
 	struct fd_bo *bo = NULL;
+	struct fd_bo_bucket *bucket;
 	uint32_t handle;
 	int ret;
 
-	ret = dev->funcs->bo_new_handle(dev, ALIGN(size, 4096), flags, &handle);
+	size = ALIGN(size, 4096);
+	bucket = get_bucket(dev, size);
+
+	/* see if we can be green and recycle: */
+	if (bucket) {
+		size = bucket->size;
+		bo = find_in_bucket(dev, bucket, flags);
+		if (bo) {
+			atomic_set(&bo->refcnt, 1);
+			fd_device_ref(bo->dev);
+			return bo;
+		}
+	}
+
+	ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
 	if (ret)
 		return NULL;
 
 	pthread_mutex_lock(&table_lock);
 	bo = bo_from_handle(dev, size, handle);
+	bo->bo_reuse = 1;
 	pthread_mutex_unlock(&table_lock);
 
 	return bo;
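Two properties of the lookup above are worth noting: the size handed to
fd_bo_new() is rounded up to the bucket size, and a cached buffer is only
taken when the non-blocking fd_bo_cpu_prep() probe says the GPU is finished
with it. A small sketch of both (illustrative, not part of the patch;
assumes `dev` is a previously created fd_device):

	/* a 5000 byte request aligns to 8192 and lands in the 8 KiB
	 * bucket, so the returned bo is effectively 8192 bytes:
	 */
	struct fd_bo *bo = fd_bo_new(dev, 5000, DRM_FREEDRENO_GEM_TYPE_KMEM);

	/* the same NOSYNC idiom used by is_idle() is available to callers;
	 * a non-zero return (-EBUSY) means the GPU still owns the buffer:
	 */
	if (fd_bo_cpu_prep(bo, NULL, DRM_FREEDRENO_PREP_READ |
			DRM_FREEDRENO_PREP_WRITE |
			DRM_FREEDRENO_PREP_NOSYNC) == 0) {
		/* idle: safe to touch from the CPU without stalling */
	}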
@@ -144,30 +250,61 @@ struct fd_bo * fd_bo_ref(struct fd_bo *bo)
 
 void fd_bo_del(struct fd_bo *bo)
 {
-	struct fd_device *dev;
+	struct fd_device *dev = bo->dev;
 
 	if (!atomic_dec_and_test(&bo->refcnt))
 		return;
 
+	pthread_mutex_lock(&table_lock);
+
+	if (bo->bo_reuse) {
+		struct fd_bo_bucket *bucket = get_bucket(dev, bo->size);
+
+		/* see if we can be green and recycle: */
+		if (bucket) {
+			struct timespec time;
+
+			clock_gettime(CLOCK_MONOTONIC, &time);
+
+			bo->free_time = time.tv_sec;
+			list_addtail(&bo->list, &bucket->list);
+			fd_cleanup_bo_cache(dev, time.tv_sec);
+
+			/* bo's in the bucket cache don't have a ref and
+			 * don't hold a ref to the dev:
+			 */
+
+			goto out;
+		}
+	}
+
+	bo_del(bo);
+out:
+	fd_device_del_locked(dev);
+	pthread_mutex_unlock(&table_lock);
+}
+
+/* Called under table_lock */
+static void bo_del(struct fd_bo *bo)
+{
 	if (bo->map)
 		munmap(bo->map, bo->size);
 
+	/* TODO probably bo's in bucket list get removed from
+	 * handle table??
+	 */
+
 	if (bo->handle) {
 		struct drm_gem_close req = {
 				.handle = bo->handle,
 		};
-		pthread_mutex_lock(&table_lock);
 		drmHashDelete(bo->dev->handle_table, bo->handle);
 		if (bo->name)
 			drmHashDelete(bo->dev->name_table, bo->name);
 		drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
-		pthread_mutex_unlock(&table_lock);
 	}
 
-	dev = bo->dev;
 	bo->funcs->destroy(bo);
-
-	fd_device_del(dev);
 }
 
 int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
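Restating the lifetime rule from the comment above: a buffer parked in a
bucket has a zero refcount and holds no device reference, and fd_bo_new()
revives it by resetting the refcount and re-taking the device ref (which is
why fd_device_del_locked() is still called on the cache path). A hypothetical
sequence, assuming `dev` as before:

	struct fd_bo *a = fd_bo_new(dev, 0x1000, 0);
	fd_bo_del(a);	/* refcnt -> 0; a is parked in the 4 KiB bucket, not freed */

	struct fd_bo *b = fd_bo_new(dev, 0x1000, 0);
	/* if a was idle, b == a: the refcnt is set back to 1 and the device
	 * re-referenced, with no new GEM allocation or mmap setup on this path
	 */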
freedreno/freedreno_device.c:

@@ -39,6 +39,44 @@ static void * dev_table;
 struct fd_device * kgsl_device_new(int fd);
 struct fd_device * msm_device_new(int fd);
 
+static void
+add_bucket(struct fd_device *dev, int size)
+{
+	unsigned int i = dev->num_buckets;
+
+	assert(i < ARRAY_SIZE(dev->cache_bucket));
+
+	list_inithead(&dev->cache_bucket[i].list);
+	dev->cache_bucket[i].size = size;
+	dev->num_buckets++;
+}
+
+static void
+init_cache_buckets(struct fd_device *dev)
+{
+	unsigned long size, cache_max_size = 64 * 1024 * 1024;
+
+	/* OK, so power of two buckets was too wasteful of memory.
+	 * Give 3 other sizes between each power of two, to hopefully
+	 * cover things accurately enough.  (The alternative is
+	 * probably to just go for exact matching of sizes, and assume
+	 * that for things like composited window resize the tiled
+	 * width/height alignment and rounding of sizes to pages will
+	 * get us useful cache hit rates anyway)
+	 */
+	add_bucket(dev, 4096);
+	add_bucket(dev, 4096 * 2);
+	add_bucket(dev, 4096 * 3);
+
+	/* Initialize the linked lists for BO reuse cache. */
+	for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
+		add_bucket(dev, size);
+		add_bucket(dev, size + size * 1 / 4);
+		add_bucket(dev, size + size * 2 / 4);
+		add_bucket(dev, size + size * 3 / 4);
+	}
+}
+
 static struct fd_device * fd_device_new_impl(int fd)
 {
 	struct fd_device *dev;
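The resulting ladder is 4 KiB, 8 KiB, 12 KiB, then for each power of two from
16 KiB through 64 MiB four buckets at x, 1.25x, 1.5x and 1.75x. A standalone
sketch that reproduces the arithmetic (not part of the patch):

	#include <stdio.h>

	/* recompute the bucket ladder from init_cache_buckets() above;
	 * 3 fixed buckets + 13 power-of-two steps * 4 = 55 sizes total
	 */
	int main(void)
	{
		unsigned long size, cache_max_size = 64 * 1024 * 1024;
		int n = 3;

		printf("%d %d %d\n", 4096, 4096 * 2, 4096 * 3);
		for (size = 4 * 4096; size <= cache_max_size; size *= 2, n += 4)
			printf("%lu %lu %lu %lu\n", size, size + size / 4,
					size + size / 2, size + size * 3 / 4);
		printf("%d buckets\n", n);	/* prints "55 buckets" */
		return 0;
	}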
@@ -69,6 +107,7 @@ static struct fd_device * fd_device_new_impl(int fd)
 	dev->fd = fd;
 	dev->handle_table = drmHashCreate();
 	dev->name_table = drmHashCreate();
+	init_cache_buckets(dev);
 
 	return dev;
 }
@@ -102,14 +141,27 @@ struct fd_device * fd_device_ref(struct fd_device *dev)
 	return dev;
 }
 
+static void fd_device_del_impl(struct fd_device *dev)
+{
+	fd_cleanup_bo_cache(dev, 0);
+	drmHashDestroy(dev->handle_table);
+	drmHashDestroy(dev->name_table);
+	drmHashDelete(dev_table, dev->fd);
+	dev->funcs->destroy(dev);
+}
+
+void fd_device_del_locked(struct fd_device *dev)
+{
+	if (!atomic_dec_and_test(&dev->refcnt))
+		return;
+	fd_device_del_impl(dev);
+}
+
 void fd_device_del(struct fd_device *dev)
 {
 	if (!atomic_dec_and_test(&dev->refcnt))
 		return;
 	pthread_mutex_lock(&table_lock);
-	drmHashDestroy(dev->handle_table);
-	drmHashDestroy(dev->name_table);
-	drmHashDelete(dev_table, dev->fd);
+	fd_device_del_impl(dev);
 	pthread_mutex_unlock(&table_lock);
-	dev->funcs->destroy(dev);
 }
freedreno/freedreno_priv.h:

@@ -59,6 +59,11 @@ struct fd_device_funcs {
 	void (*destroy)(struct fd_device *dev);
 };
 
+struct fd_bo_bucket {
+	uint32_t size;
+	struct list_head list;
+};
+
 struct fd_device {
 	int fd;
 	atomic_t refcnt;
@@ -75,8 +80,17 @@ struct fd_device {
 	void *handle_table, *name_table;
 
 	struct fd_device_funcs *funcs;
+
+	struct fd_bo_bucket cache_bucket[14 * 4];
+	int num_buckets;
+	time_t time;
 };
 
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time);
+
+/* for where @table_lock is already held: */
+void fd_device_del_locked(struct fd_device *dev);
+
 struct fd_pipe_funcs {
 	struct fd_ringbuffer * (*ringbuffer_new)(struct fd_pipe *pipe, uint32_t size);
 	int (*get_param)(struct fd_pipe *pipe, enum fd_param_id param, uint64_t *value);
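(Sanity check on the array bound above: 3 fixed buckets plus 13 power-of-two
steps times 4 gives 55 entries, which fits in the 14 * 4 = 56 slots; the
assert() in add_bucket() guards the limit.)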
@@ -120,6 +134,10 @@ struct fd_bo {
 	void *map;
 	atomic_t refcnt;
 	struct fd_bo_funcs *funcs;
+
+	int bo_reuse;
+	struct list_head list;   /* bucket-list entry */
+	time_t free_time;        /* time when added to bucket-list */
 };
 
 struct fd_bo *fd_bo_from_handle(struct fd_device *dev,
freedreno/kgsl/kgsl_bo.c:

@@ -80,9 +80,24 @@ static int kgsl_bo_offset(struct fd_bo *bo, uint64_t *offset)
 static int kgsl_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
 {
 	uint32_t timestamp = kgsl_bo_get_timestamp(to_kgsl_bo(bo));
-	if (timestamp) {
-		fd_pipe_wait(pipe, timestamp);
+
+	if (op & DRM_FREEDRENO_PREP_NOSYNC) {
+		uint32_t current;
+		int ret;
+
+		ret = kgsl_pipe_timestamp(to_kgsl_pipe(pipe), &current);
+		if (ret)
+			return ret;
+
+		if (timestamp > current)
+			return -EBUSY;
+
+		return 0;
 	}
+
+	if (timestamp)
+		fd_pipe_wait(pipe, timestamp);
+
 	return 0;
 }