freedreno: add bo cache

Workloads that create many transient buffers cause significant CPU
overhead in buffer allocation, zeroing, cache maintenance, and mmap
setup.  Caching and re-using existing buffers cuts that overhead
considerably.  See:

http://bloggingthemonkey.blogspot.com/2013/09/freedreno-update-moar-fps.html

A simple time-based policy is used for purging the cache.  Once the
kernel supports it, we could use a madvise-style API to handle
memory-pressure scenarios a bit better.
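
For illustration, the allocation pattern that benefits looks roughly
like this (the loop is hypothetical; fd_bo_new()/fd_bo_del() are the
entry points touched below, and the flags value is just an example):

	for (i = 0; i < nframes; i++) {
		/* transient buffer of the same size every frame: */
		struct fd_bo *bo = fd_bo_new(dev, size,
				DRM_FREEDRENO_GEM_TYPE_KMEM);
		/* ... fill and submit ... */
		fd_bo_del(bo);  /* returns to bucket cache, no GEM_CLOSE */
	}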

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Rob Clark 2013-12-13 12:48:30 -05:00
parent 1489811a80
commit 068ea68b3f
4 changed files with 235 additions and 13 deletions


@@ -31,6 +31,8 @@
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void bo_del(struct fd_bo *bo);
/* set buffer name, and add to table, call w/ table_lock held: */
static void set_name(struct fd_bo *bo, uint32_t name)
{
@@ -68,24 +70,128 @@ static struct fd_bo * bo_from_handle(struct fd_device *dev,
bo->size = size;
bo->handle = handle;
atomic_set(&bo->refcnt, 1);
list_inithead(&bo->list);
/* add ourself into the handle table: */
drmHashInsert(dev->handle_table, handle, bo);
return bo;
}
/* Frees older cached buffers. Called under table_lock */
void fd_cleanup_bo_cache(struct fd_device *dev, time_t time)
{
int i;
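	/* we only scan the buckets once per second (dev->time); a time
	 * of zero, as passed from device teardown, skips the age check
	 * below so that every cached buffer is purged:
	 */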
if (dev->time == time)
return;
for (i = 0; i < dev->num_buckets; i++) {
struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
struct fd_bo *bo;
while (!LIST_IS_EMPTY(&bucket->list)) {
bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
/* keep things in cache for at least 1 second: */
if (time && ((time - bo->free_time) <= 1))
break;
list_del(&bo->list);
bo_del(bo);
}
}
dev->time = time;
}
static struct fd_bo_bucket * get_bucket(struct fd_device *dev, uint32_t size)
{
int i;
/* hmm, this is what intel does, but I suppose we could calculate our
* way to the correct bucket size rather than looping..
*/
for (i = 0; i < dev->num_buckets; i++) {
struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
if (bucket->size >= size) {
return bucket;
}
}
return NULL;
}
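
As the comment above notes, the bucket could also be computed in O(1)
instead of scanned for.  A hypothetical sketch (bucket_index() is not
part of this change) that mirrors the layout init_cache_buckets()
creates below:

	static int bucket_index(uint32_t size)
	{
		unsigned pot, idx;

		if (size <= 3 * 4096)          /* buckets 0..2: 4K, 8K, 12K */
			return (size + 4095) / 4096 - 1;

		/* then four buckets per power of two, starting at 16K:
		 * pot, pot*5/4, pot*3/2, pot*7/4
		 */
		pot = 4 * 4096;
		idx = 3;
		while (2 * pot < size) {
			pot *= 2;
			idx += 4;
		}
		/* round up to the next pot/4 step within the group; the
		 * caller would still check idx < dev->num_buckets:
		 */
		return idx + (size - pot + pot / 4 - 1) / (pot / 4);
	}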
static int is_idle(struct fd_bo *bo)
{
return fd_bo_cpu_prep(bo, NULL,
DRM_FREEDRENO_PREP_READ |
DRM_FREEDRENO_PREP_WRITE |
DRM_FREEDRENO_PREP_NOSYNC) == 0;
}
static struct fd_bo *find_in_bucket(struct fd_device *dev,
struct fd_bo_bucket *bucket, uint32_t flags)
{
struct fd_bo *bo = NULL;
/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
* skip the busy check.. if it is only going to be a render target
* then we probably don't need to stall..
*
* NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
* (MRU, since likely to be in GPU cache), rather than head (LRU)..
*/
pthread_mutex_lock(&table_lock);
while (!LIST_IS_EMPTY(&bucket->list)) {
bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
if (0 /* TODO: if madvise tells us bo is gone... */) {
list_del(&bo->list);
bo_del(bo);
bo = NULL;
continue;
}
/* TODO check for compatible flags? */
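		/* bucket lists are kept in free order (oldest at head),
		 * so if the oldest entry is still busy there is little
		 * point in checking anything freed more recently:
		 */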
if (is_idle(bo)) {
list_del(&bo->list);
break;
}
bo = NULL;
break;
}
pthread_mutex_unlock(&table_lock);
return bo;
}
struct fd_bo * fd_bo_new(struct fd_device *dev,
uint32_t size, uint32_t flags)
{
struct fd_bo *bo = NULL;
struct fd_bo_bucket *bucket;
uint32_t handle;
int ret;
	size = ALIGN(size, 4096);
bucket = get_bucket(dev, size);
/* see if we can be green and recycle: */
if (bucket) {
size = bucket->size;
bo = find_in_bucket(dev, bucket, flags);
if (bo) {
atomic_set(&bo->refcnt, 1);
fd_device_ref(bo->dev);
return bo;
}
}
ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
if (ret)
return NULL;
pthread_mutex_lock(&table_lock);
bo = bo_from_handle(dev, size, handle);
bo->bo_reuse = 1;
pthread_mutex_unlock(&table_lock);
return bo;
@@ -144,30 +250,61 @@ struct fd_bo * fd_bo_ref(struct fd_bo *bo)
void fd_bo_del(struct fd_bo *bo)
{
	struct fd_device *dev = bo->dev;
if (!atomic_dec_and_test(&bo->refcnt))
return;
pthread_mutex_lock(&table_lock);
if (bo->bo_reuse) {
struct fd_bo_bucket *bucket = get_bucket(dev, bo->size);
/* see if we can be green and recycle: */
if (bucket) {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
bo->free_time = time.tv_sec;
list_addtail(&bo->list, &bucket->list);
fd_cleanup_bo_cache(dev, time.tv_sec);
/* bo's in the bucket cache don't have a ref and
* don't hold a ref to the dev:
*/
goto out;
}
}
bo_del(bo);
out:
fd_device_del_locked(dev);
pthread_mutex_unlock(&table_lock);
}
/* Called under table_lock */
static void bo_del(struct fd_bo *bo)
{
if (bo->map)
munmap(bo->map, bo->size);
/* TODO probably bo's in bucket list get removed from
* handle table??
*/
if (bo->handle) {
struct drm_gem_close req = {
.handle = bo->handle,
};
drmHashDelete(bo->dev->handle_table, bo->handle);
if (bo->name)
drmHashDelete(bo->dev->name_table, bo->name);
drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
	}

	bo->funcs->destroy(bo);
}
int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)


@@ -39,6 +39,44 @@ static void * dev_table;
struct fd_device * kgsl_device_new(int fd);
struct fd_device * msm_device_new(int fd);
static void
add_bucket(struct fd_device *dev, int size)
{
unsigned int i = dev->num_buckets;
assert(i < ARRAY_SIZE(dev->cache_bucket));
list_inithead(&dev->cache_bucket[i].list);
dev->cache_bucket[i].size = size;
dev->num_buckets++;
}
static void
init_cache_buckets(struct fd_device *dev)
{
unsigned long size, cache_max_size = 64 * 1024 * 1024;
/* OK, so power of two buckets was too wasteful of memory.
* Give 3 other sizes between each power of two, to hopefully
* cover things accurately enough. (The alternative is
* probably to just go for exact matching of sizes, and assume
* that for things like composited window resize the tiled
* width/height alignment and rounding of sizes to pages will
* get us useful cache hit rates anyway)
*/
add_bucket(dev, 4096);
add_bucket(dev, 4096 * 2);
add_bucket(dev, 4096 * 3);
/* Initialize the linked lists for BO reuse cache. */
for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
add_bucket(dev, size);
add_bucket(dev, size + size * 1 / 4);
add_bucket(dev, size + size * 2 / 4);
add_bucket(dev, size + size * 3 / 4);
}
}
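
For reference, this produces buckets of 4K, 8K and 12K, then four per
power of two from 16K up to 64M (pot, pot*5/4, pot*3/2, pot*7/4): with
13 powers of two in that range, that is 3 + 13 * 4 = 55 buckets, which
fits the cache_bucket[14 * 4] array declared in the header below.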
static struct fd_device * fd_device_new_impl(int fd)
{
struct fd_device *dev;
@@ -69,6 +107,7 @@ static struct fd_device * fd_device_new_impl(int fd)
dev->fd = fd;
dev->handle_table = drmHashCreate();
dev->name_table = drmHashCreate();
init_cache_buckets(dev);
return dev;
}
@@ -102,14 +141,27 @@ struct fd_device * fd_device_ref(struct fd_device *dev)
return dev;
}
static void fd_device_del_impl(struct fd_device *dev)
{
fd_cleanup_bo_cache(dev, 0);
drmHashDestroy(dev->handle_table);
drmHashDestroy(dev->name_table);
drmHashDelete(dev_table, dev->fd);
dev->funcs->destroy(dev);
}
void fd_device_del_locked(struct fd_device *dev)
{
if (!atomic_dec_and_test(&dev->refcnt))
return;
fd_device_del_impl(dev);
}
void fd_device_del(struct fd_device *dev)
{
if (!atomic_dec_and_test(&dev->refcnt))
return;
pthread_mutex_lock(&table_lock);
	fd_device_del_impl(dev);
	pthread_mutex_unlock(&table_lock);
}


@@ -59,6 +59,11 @@ struct fd_device_funcs {
void (*destroy)(struct fd_device *dev);
};
struct fd_bo_bucket {
uint32_t size;
struct list_head list;
};
struct fd_device {
int fd;
atomic_t refcnt;
@@ -75,8 +80,17 @@ struct fd_device {
void *handle_table, *name_table;
struct fd_device_funcs *funcs;
struct fd_bo_bucket cache_bucket[14 * 4];
int num_buckets;
time_t time;
};
void fd_cleanup_bo_cache(struct fd_device *dev, time_t time);
/* for where @table_lock is already held: */
void fd_device_del_locked(struct fd_device *dev);
struct fd_pipe_funcs {
struct fd_ringbuffer * (*ringbuffer_new)(struct fd_pipe *pipe, uint32_t size);
int (*get_param)(struct fd_pipe *pipe, enum fd_param_id param, uint64_t *value);
@@ -120,6 +134,10 @@ struct fd_bo {
void *map;
atomic_t refcnt;
struct fd_bo_funcs *funcs;
int bo_reuse;
struct list_head list; /* bucket-list entry */
time_t free_time; /* time when added to bucket-list */
};
struct fd_bo *fd_bo_from_handle(struct fd_device *dev,


@@ -80,9 +80,24 @@ static int kgsl_bo_offset(struct fd_bo *bo, uint64_t *offset)
static int kgsl_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
{
uint32_t timestamp = kgsl_bo_get_timestamp(to_kgsl_bo(bo));
	if (op & DRM_FREEDRENO_PREP_NOSYNC) {
uint32_t current;
int ret;
ret = kgsl_pipe_timestamp(to_kgsl_pipe(pipe), &current);
if (ret)
return ret;
if (timestamp > current)
return -EBUSY;
return 0;
}
if (timestamp)
fd_pipe_wait(pipe, timestamp);
return 0;
}
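
With NOSYNC handled, callers can poll a buffer's idleness without
stalling, which is what is_idle() in the bo cache does.  A minimal
usage sketch:

	/* returns -EBUSY immediately instead of waiting: */
	if (fd_bo_cpu_prep(bo, pipe, DRM_FREEDRENO_PREP_WRITE |
			DRM_FREEDRENO_PREP_NOSYNC) == 0) {
		/* idle: safe to touch from the CPU right away */
	}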