/* * scst_mem.c * * Copyright (C) 2006 - 2018 Vladislav Bolkhovitin * Copyright (C) 2007 - 2018 Western Digital Corporation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, version 2 * of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #ifdef INSIDE_KERNEL_TREE #include #else #include "scst.h" #endif #include "scst_priv.h" #include "scst_mem.h" #define SGV_DEFAULT_PURGE_INTERVAL (60 * HZ) #define SGV_MIN_SHRINK_INTERVAL (1 * HZ) /* Max pages freed from a pool per shrinking iteration */ #define MAX_PAGES_PER_POOL 50 bool scst_force_global_sgv_pool; static struct sgv_pool *sgv_dma_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_norm_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_dma_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_main, *sgv_norm_pool_main, *sgv_dma_pool_main; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_SCST_PROC) static struct lock_class_key scst_pool_key; static struct lockdep_map scst_pool_dep_map = STATIC_LOCKDEP_MAP_INIT("scst_pool_kref", &scst_pool_key); #endif #endif #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS static atomic_t sgv_pages_total = ATOMIC_INIT(0); #endif /* Both read-only */ static int sgv_hi_wmk; static int sgv_lo_wmk; static int sgv_max_local_pages, sgv_max_trans_pages; static DEFINE_SPINLOCK(sgv_pools_lock); /* inner lock for sgv_pool_lock! */ static DEFINE_MUTEX(sgv_pools_mutex); static atomic_t sgv_releases_on_hiwmk = ATOMIC_INIT(0); static atomic_t sgv_releases_on_hiwmk_failed = ATOMIC_INIT(0); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS static atomic_t sgv_other_total_alloc = ATOMIC_INIT(0); #endif #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) static struct shrinker *sgv_shrinker; #else static struct shrinker sgv_shrinker; #endif static struct kmem_cache *sgv_pool_cachep; /* * Protected by sgv_pools_mutex AND sgv_pools_lock for writes, * either one for reads. 
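 *
 * Write-side locking pattern, as used by sgv_pool_init() and
 * sgv_pool_destroy() below (illustrative sketch only):
 *
 *   mutex_lock(&sgv_pools_mutex);
 *   spin_lock_bh(&sgv_pools_lock);
 *   list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list);
 *   spin_unlock_bh(&sgv_pools_lock);
 *   mutex_unlock(&sgv_pools_mutex);
 *
 * Readers either take one of the locks (see __sgv_can_be_shrunk()) or
 * walk the list under RCU (see __sgv_shrink()); writers call
 * synchronize_rcu() after unlinking a pool.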
*/ static LIST_HEAD(sgv_pools_list); static struct kobject *scst_sgv_kobj; static int scst_sgv_sysfs_create(struct sgv_pool *pool); static void scst_sgv_sysfs_del(struct sgv_pool *pool); static inline bool sgv_pool_clustered(const struct sgv_pool *pool) { return pool->clustering_type != sgv_no_clustering; } void scst_sgv_pool_use_norm(struct scst_tgt_dev *tgt_dev) { tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_norm_pool_per_cpu; else tgt_dev->pools = sgv_norm_pool_global; tgt_dev->tgt_dev_clust_pool = 0; } void scst_sgv_pool_use_norm_clust(struct scst_tgt_dev *tgt_dev) { TRACE_MEM("%s", "Use clustering"); tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_norm_clust_pool_per_cpu; else tgt_dev->pools = sgv_norm_clust_pool_global; tgt_dev->tgt_dev_clust_pool = 1; } void scst_sgv_pool_use_dma(struct scst_tgt_dev *tgt_dev) { TRACE_MEM("%s", "Use ISA DMA memory"); tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN | GFP_DMA; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_dma_pool_per_cpu; else tgt_dev->pools = sgv_dma_pool_global; tgt_dev->tgt_dev_clust_pool = 0; } /* Must be no locks */ static void sgv_dtor_and_free(struct sgv_pool_obj *obj) { struct sgv_pool *pool = obj->owner_pool; TRACE_MEM("Destroying sgv obj %p", obj); if (obj->sg_count != 0) { pool->alloc_fns.free_pages_fn(obj->sg_entries, obj->sg_count, obj->allocator_priv); } if (obj->sg_entries != obj->sg_entries_data) { if (obj->trans_tbl != (struct trans_tbl_ent *)obj->sg_entries_data) { /* kfree() handles NULL parameter */ kfree(obj->trans_tbl); obj->trans_tbl = NULL; } kfree(obj->sg_entries); } kmem_cache_free(pool->caches[obj->cache_num], obj); return; } /* Must be called under sgv_pool_lock held */ static void sgv_dec_cached_entries(struct sgv_pool *pool, int pages) { pool->cached_entries--; pool->cached_pages -= pages; } /* Must be called under sgv_pool_lock held */ static void __sgv_purge_from_cache(struct sgv_pool_obj *obj) { int pages = obj->pages; struct sgv_pool *pool = obj->owner_pool; TRACE_MEM("Purging sgv obj %p from pool %p (new cached_entries %d)", obj, pool, pool->cached_entries-1); list_del(&obj->sorted_recycling_list_entry); list_del(&obj->recycling_list_entry); pool->inactive_cached_pages -= pages; sgv_dec_cached_entries(pool, pages); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_sub(pages, &sgv_pages_total); #endif return; } /* Must be called under sgv_pool_lock held */ static bool sgv_purge_from_cache(struct sgv_pool_obj *obj, int min_interval, unsigned long cur_time) { EXTRACHECKS_BUG_ON(min_interval < 0); TRACE_MEM("Checking if sgv obj %p should be purged (cur time %ld, " "obj time %ld, time to purge %ld)", obj, cur_time, obj->time_stamp, obj->time_stamp + min_interval); if (time_after_eq(cur_time, (obj->time_stamp + min_interval))) { __sgv_purge_from_cache(obj); return true; } return false; } /* No locks */ static int sgv_shrink_pool(struct sgv_pool *pool, int nr, int min_interval, unsigned long cur_time, int *out_freed) { int freed = 0; TRACE_ENTRY(); TRACE_MEM("Trying to shrink pool %p (nr %d, min_interval %d)", pool, nr, min_interval); if (pool->purge_interval < 0) { TRACE_MEM("Not shrinkable pool %p, skipping", pool); goto out; } spin_lock_bh(&pool->sgv_pool_lock); while (!list_empty(&pool->sorted_recycling_list) && #ifdef CONFIG_SCST_NO_TOTAL_MEM_CHECKS true) { #else (atomic_read(&sgv_pages_total) > sgv_lo_wmk)) { #endif struct sgv_pool_obj *obj = list_first_entry( &pool->sorted_recycling_list, struct 
sgv_pool_obj, sorted_recycling_list_entry); if (sgv_purge_from_cache(obj, min_interval, cur_time)) { int pages = obj->pages; freed += pages; nr -= pages; TRACE_MEM("%d pages purged from pool %p (nr left %d, " "total freed %d)", pages, pool, nr, freed); spin_unlock_bh(&pool->sgv_pool_lock); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } else break; if ((nr <= 0) || (freed >= MAX_PAGES_PER_POOL)) { if (freed >= MAX_PAGES_PER_POOL) TRACE_MEM("%d pages purged from pool %p, " "leaving", freed, pool); break; } } spin_unlock_bh(&pool->sgv_pool_lock); out: *out_freed += freed; TRACE_EXIT_RES(nr); return nr; } /* No locks */ static int __sgv_shrink(int nr, int min_interval, int *out_freed) { struct sgv_pool *pool; unsigned long cur_time = jiffies; int prev_nr = nr + 1; TRACE_ENTRY(); TRACE_MEM("Trying to shrink %d pages from all sgv pools " "(min_interval %d)", nr, min_interval); while (prev_nr > nr && nr > 0) { prev_nr = nr; rcu_read_lock(); list_for_each_entry_rcu(pool, &sgv_pools_list, sgv_pools_list_entry) { if (pool->cached_entries) nr = sgv_shrink_pool(pool, nr, min_interval, cur_time, out_freed); } rcu_read_unlock(); } TRACE_EXIT_RES(nr); return nr; } static unsigned long __sgv_can_be_shrunk(void) { unsigned long res; struct sgv_pool *pool; int inactive_pages = 0; TRACE_ENTRY(); spin_lock_bh(&sgv_pools_lock); list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) { if (pool->purge_interval > 0) inactive_pages += pool->inactive_cached_pages; } spin_unlock_bh(&sgv_pools_lock); res = max(0, inactive_pages - sgv_lo_wmk); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS TRACE_MEM("Can free %ld (total %d)", res, atomic_read(&sgv_pages_total)); #endif TRACE_EXIT_RES(res); return res; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) static unsigned long sgv_can_be_shrunk(struct shrinker *shrinker, struct shrink_control *sc) { return __sgv_can_be_shrunk(); } static unsigned long sgv_scan_shrink(struct shrinker *shrinker, struct shrink_control *sc) { int freed = 0; TRACE_ENTRY(); __sgv_shrink(sc->nr_to_scan, SGV_MIN_SHRINK_INTERVAL, &freed); TRACE_MEM("Freed %d", freed); TRACE_EXIT_RES(freed); return freed; } #else /* if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) && (!defined(RHEL_MAJOR) || RHEL_MAJOR -0 < 6) static int sgv_shrink(int nr, gfp_t gfpm) #elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) static int sgv_shrink(struct shrinker *shrinker, int nr, gfp_t gfpm) #else static int sgv_shrink(struct shrinker *shrinker, struct shrink_control *sc) #endif { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) int nr = sc->nr_to_scan; #endif int freed = 0; TRACE_ENTRY(); if (nr > 0) { nr = __sgv_shrink(nr, SGV_MIN_SHRINK_INTERVAL, &freed); TRACE_MEM("Left %d", nr); } else nr = __sgv_can_be_shrunk(); TRACE_EXIT_RES(nr); return nr; } #endif /* if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) static void sgv_purge_work_fn(void *p) #else static void sgv_purge_work_fn(struct work_struct *work) #endif { unsigned long cur_time = jiffies; #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) struct sgv_pool *pool = (struct sgv_pool *)p; #else struct sgv_pool *pool = container_of(work, struct sgv_pool, sgv_purge_work.work); #endif TRACE_ENTRY(); TRACE_MEM("Purge work for pool %p", pool); spin_lock_bh(&pool->sgv_pool_lock); pool->purge_work_scheduled = false; while (!list_empty(&pool->sorted_recycling_list)) { struct sgv_pool_obj *obj = list_first_entry( &pool->sorted_recycling_list, struct 
sgv_pool_obj, sorted_recycling_list_entry); if (sgv_purge_from_cache(obj, pool->purge_interval, cur_time)) { spin_unlock_bh(&pool->sgv_pool_lock); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } else { /* * Let's reschedule it for full period to not get here * too often. In the worst case we have shrinker * to reclaim buffers more quickly. */ TRACE_MEM("Rescheduling purge work for pool %p (delay " "%d HZ/%d sec)", pool, pool->purge_interval, pool->purge_interval/HZ); schedule_delayed_work(&pool->sgv_purge_work, pool->purge_interval); pool->purge_work_scheduled = true; break; } } spin_unlock_bh(&pool->sgv_pool_lock); TRACE_MEM("Leaving purge work for pool %p", pool); TRACE_EXIT(); return; } static int sgv_check_full_clustering(struct scatterlist *sg, int cur, int hint) { int res = -1; int i = hint; unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur])); int len_cur = sg[cur].length; unsigned long pfn_cur_next = pfn_cur + (len_cur >> PAGE_SHIFT); int full_page_cur = (len_cur & (PAGE_SIZE - 1)) == 0; unsigned long pfn, pfn_next; bool full_page; #if 0 TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d", pfn_cur, pfn_cur_next, len_cur, full_page_cur); #endif /* check the hint first */ if (i >= 0) { pfn = page_to_pfn(sg_page(&sg[i])); pfn_next = pfn + (sg[i].length >> PAGE_SHIFT); full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0; if ((pfn == pfn_cur_next) && full_page_cur) goto out_head; if ((pfn_next == pfn_cur) && full_page) goto out_tail; } /* ToDo: implement more intelligent search */ for (i = cur - 1; i >= 0; i--) { pfn = page_to_pfn(sg_page(&sg[i])); pfn_next = pfn + (sg[i].length >> PAGE_SHIFT); full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0; if ((pfn == pfn_cur_next) && full_page_cur) goto out_head; if ((pfn_next == pfn_cur) && full_page) goto out_tail; } out: return res; out_tail: TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, i); sg[i].length += len_cur; sg_clear(&sg[cur]); res = i; goto out; out_head: TRACE_MEM("SG segment %d will be head merged with segment %d", cur, i); sg_assign_page(&sg[i], sg_page(&sg[cur])); sg[i].length += len_cur; sg_clear(&sg[cur]); res = i; goto out; } static int sgv_check_tail_clustering(struct scatterlist *sg, int cur, int hint) { int res = -1; unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur])); int len_cur = sg[cur].length; int prev; unsigned long pfn_prev; bool full_page; #ifdef SCST_HIGHMEM if (page >= highmem_start_page) { TRACE_MEM("%s", "HIGHMEM page allocated, no clustering") goto out; } #endif #if 0 TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d", pfn_cur, pfn_cur_next, len_cur, full_page_cur); #endif if (cur == 0) goto out; prev = cur - 1; pfn_prev = page_to_pfn(sg_page(&sg[prev])) + (sg[prev].length >> PAGE_SHIFT); full_page = (sg[prev].length & (PAGE_SIZE - 1)) == 0; if ((pfn_prev == pfn_cur) && full_page) { TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, prev); sg[prev].length += len_cur; sg_clear(&sg[cur]); res = prev; } out: return res; } static void sgv_free_sys_sg_entries(struct scatterlist *sg, int sg_count, void *priv) { int i; TRACE_MEM("sg=%p, sg_count=%d", sg, sg_count); for (i = 0; i < sg_count; i++) { struct page *p = sg_page(&sg[i]); int len = sg[i].length; int pages = PAGE_ALIGN(len) >> PAGE_SHIFT; TRACE_MEM("page %lx, len %d, pages %d", (unsigned long)p, len, pages); while (pages > 0) { int order = 0; TRACE_MEM("free_pages(): order %d, page %lx", order, (unsigned long)p); __free_pages(p, order); pages -= 1 << order; p += 1 << 
order; } } } static struct page *sgv_alloc_sys_pages(struct scatterlist *sg, gfp_t gfp_mask, void *priv) { struct page *page = alloc_pages(gfp_mask, 0); sg_set_page(sg, page, PAGE_SIZE, 0); TRACE_MEM("page=%p, sg=%p, priv=%p", page, sg, priv); if (page == NULL) { TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of " "sg page failed"); } return page; } static int sgv_alloc_sg_entries(struct scatterlist *sg, int pages, gfp_t gfp_mask, enum sgv_clustering_types clustering_type, struct trans_tbl_ent *trans_tbl, const struct sgv_pool_alloc_fns *alloc_fns, void *priv) { int sg_count = 0; int pg, i, j; int merged = -1; TRACE_MEM("pages=%d, clustering_type=%d", pages, clustering_type); #if 0 gfp_mask |= __GFP_COLD; #endif #ifdef CONFIG_SCST_STRICT_SECURITY gfp_mask |= __GFP_ZERO; #endif for (pg = 0; pg < pages; pg++) { void *rc; #ifdef CONFIG_SCST_DEBUG_OOM if (((gfp_mask & __GFP_NOFAIL) != __GFP_NOFAIL) && ((scst_random() % 10000) == 55)) rc = NULL; else #endif rc = alloc_fns->alloc_pages_fn(&sg[sg_count], gfp_mask, priv); if (rc == NULL) goto out_no_mem; /* * This code allows compiler to see full body of the clustering * functions and gives it a chance to generate better code. * At least, the resulting code is smaller, comparing to * calling them using a function pointer. */ if (clustering_type == sgv_full_clustering) merged = sgv_check_full_clustering(sg, sg_count, merged); else if (clustering_type == sgv_tail_clustering) merged = sgv_check_tail_clustering(sg, sg_count, merged); else merged = -1; if (merged == -1) sg_count++; TRACE_MEM("pg=%d, merged=%d, sg_count=%d", pg, merged, sg_count); } if ((clustering_type != sgv_no_clustering) && (trans_tbl != NULL)) { pg = 0; for (i = 0; i < pages; i++) { int n = PAGE_ALIGN(sg[i].length) >> PAGE_SHIFT; trans_tbl[i].pg_count = pg; for (j = 0; j < n; j++) trans_tbl[pg++].sg_num = i+1; TRACE_MEM("i=%d, n=%d, pg_count=%d", i, n, trans_tbl[i].pg_count); } } out: TRACE_MEM("sg_count=%d", sg_count); return sg_count; out_no_mem: alloc_fns->free_pages_fn(sg, sg_count, priv); sg_count = 0; goto out; } static int sgv_alloc_arrays(struct sgv_pool_obj *obj, int pages_to_alloc, gfp_t gfp_mask) { int sz, tsz = 0; int res = 0; TRACE_ENTRY(); sz = pages_to_alloc * sizeof(obj->sg_entries[0]); obj->sg_entries = kmalloc(sz, gfp_mask); if (unlikely(obj->sg_entries == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool_obj " "SG vector failed (size %d)", sz); res = -ENOMEM; goto out; } sg_init_table(obj->sg_entries, pages_to_alloc); if (sgv_pool_clustered(obj->owner_pool)) { if (pages_to_alloc <= sgv_max_trans_pages) { obj->trans_tbl = (struct trans_tbl_ent *)obj->sg_entries_data; /* * No need to clear trans_tbl, if needed, it will be * fully rewritten in sgv_alloc_sg_entries() */ } else { tsz = pages_to_alloc * sizeof(obj->trans_tbl[0]); obj->trans_tbl = kzalloc(tsz, gfp_mask); if (unlikely(obj->trans_tbl == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "trans_tbl failed (size %d)", tsz); res = -ENOMEM; goto out_free; } } } TRACE_MEM("pages_to_alloc %d, sz %d, tsz %d, obj %p, sg_entries %p, " "trans_tbl %p", pages_to_alloc, sz, tsz, obj, obj->sg_entries, obj->trans_tbl); out: TRACE_EXIT_RES(res); return res; out_free: kfree(obj->sg_entries); obj->sg_entries = NULL; goto out; } static struct sgv_pool_obj *sgv_get_obj(struct sgv_pool *pool, int cache_num, int pages, gfp_t gfp_mask, bool get_new) { struct sgv_pool_obj *obj; spin_lock_bh(&pool->sgv_pool_lock); if (unlikely(get_new)) { /* Used only for buffers preallocation */ goto get_new; } if 
(likely(!list_empty(&pool->recycling_lists[cache_num]))) { obj = list_first_entry(&pool->recycling_lists[cache_num], struct sgv_pool_obj, recycling_list_entry); list_del(&obj->sorted_recycling_list_entry); list_del(&obj->recycling_list_entry); pool->inactive_cached_pages -= pages; spin_unlock_bh(&pool->sgv_pool_lock); goto out; } get_new: pool->cached_entries++; pool->cached_pages += pages; spin_unlock_bh(&pool->sgv_pool_lock); TRACE_MEM("New cached entries %d (pool %p)", pool->cached_entries, pool); obj = kmem_cache_alloc(pool->caches[cache_num], gfp_mask & ~(__GFP_HIGHMEM|GFP_DMA)); if (likely(obj)) { memset(obj, 0, sizeof(*obj)); obj->cache_num = cache_num; obj->pages = pages; obj->owner_pool = pool; } else { spin_lock_bh(&pool->sgv_pool_lock); sgv_dec_cached_entries(pool, pages); spin_unlock_bh(&pool->sgv_pool_lock); } out: return obj; } static void sgv_put_obj(struct sgv_pool_obj *obj) { struct sgv_pool *pool = obj->owner_pool; struct list_head *entry; struct list_head *list = &pool->recycling_lists[obj->cache_num]; int pages = obj->pages; spin_lock_bh(&pool->sgv_pool_lock); TRACE_MEM("sgv %p, cache num %d, pages %d, sg_count %d", obj, obj->cache_num, pages, obj->sg_count); if (sgv_pool_clustered(pool)) { /* Make objects with less entries more preferred */ __list_for_each(entry, list) { struct sgv_pool_obj *tmp = list_entry(entry, struct sgv_pool_obj, recycling_list_entry); TRACE_MEM("tmp %p, cache num %d, pages %d, sg_count %d", tmp, tmp->cache_num, tmp->pages, tmp->sg_count); if (obj->sg_count <= tmp->sg_count) break; } entry = entry->prev; } else entry = list; TRACE_MEM("Adding in %p (list %p)", entry, list); list_add(&obj->recycling_list_entry, entry); list_add_tail(&obj->sorted_recycling_list_entry, &pool->sorted_recycling_list); obj->time_stamp = jiffies; pool->inactive_cached_pages += pages; if (!pool->purge_work_scheduled) { TRACE_MEM("Scheduling purge work for pool %p", pool); pool->purge_work_scheduled = true; schedule_delayed_work(&pool->sgv_purge_work, pool->purge_interval); } spin_unlock_bh(&pool->sgv_pool_lock); return; } /* No locks */ static int sgv_hiwmk_check(int pages_to_alloc) { int res = 0; #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS int pages = pages_to_alloc; pages += atomic_read(&sgv_pages_total); if (unlikely(pages > sgv_hi_wmk)) { int freed = 0; pages -= sgv_hi_wmk; atomic_inc(&sgv_releases_on_hiwmk); pages = __sgv_shrink(pages, 0, &freed); if (pages > 0) { TRACE(TRACE_OUT_OF_MEM, "Requested amount of " "memory (%d pages) for being executed " "commands together with the already " "allocated memory exceeds the allowed " "maximum %d. 
Should you increase "
				"scst_max_cmd_mem?", pages_to_alloc, sgv_hi_wmk);
			atomic_inc(&sgv_releases_on_hiwmk_failed);
			res = -ENOMEM;
			goto out_unlock;
		}
	}

	atomic_add(pages_to_alloc, &sgv_pages_total);

out_unlock:
	TRACE_MEM("pages_to_alloc %d, new total %d", pages_to_alloc,
		atomic_read(&sgv_pages_total));

#endif
	return res;
}

/* No locks */
static void sgv_hiwmk_uncheck(int pages)
{
#ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS
	atomic_sub(pages, &sgv_pages_total);
	TRACE_MEM("pages %d, new total %d", pages,
		atomic_read(&sgv_pages_total));
#endif
	return;
}

/* No locks */
static bool sgv_check_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
{
	int alloced;
	bool res = true;

	alloced = atomic_add_return(pages, &mem_lim->alloced_pages);
	if (unlikely(alloced > mem_lim->max_allowed_pages)) {
		TRACE(TRACE_OUT_OF_MEM, "Requested amount of memory "
			"(%d pages) for being executed commands on a device "
			"together with the already allocated memory exceeds "
			"the allowed maximum %d. Should you increase "
			"scst_max_dev_cmd_mem?", pages,
			mem_lim->max_allowed_pages);
		atomic_sub(pages, &mem_lim->alloced_pages);
		res = false;
	}

	TRACE_MEM("mem_lim %p, pages %d, res %d, new alloced %d", mem_lim,
		pages, res, atomic_read(&mem_lim->alloced_pages));

	return res;
}

/* No locks */
static void sgv_uncheck_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
{
	atomic_sub(pages, &mem_lim->alloced_pages);
	TRACE_MEM("mem_lim %p, pages %d, new alloced %d", mem_lim,
		pages, atomic_read(&mem_lim->alloced_pages));
	return;
}

/**
 * sgv_pool_alloc - allocate an SG vector from the SGV pool
 * @pool: the cache to allocate from
 * @size: size of the resulting SG vector in bytes
 * @gfp_mask: the allocation mask
 * @flags: the allocation flags
 * @count: the resulting number of SG entries in the SG vector
 * @sgv: the resulting SGV object
 * @mem_lim: memory limits
 * @priv: pointer to private data for this allocation
 *
 * Description:
 *    Allocates an SG vector from the SGV pool and returns a pointer to it,
 *    or NULL in case of any error. See the SGV pool documentation for more
 *    details.
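 *
 * A minimal usage sketch (illustrative only; the pool, mem_lim and the
 * 64 KB size below are placeholders, error handling is trimmed):
 *
 *    struct sgv_pool_obj *sgv = NULL;
 *    struct scatterlist *sg;
 *    int sg_cnt;
 *
 *    sg = sgv_pool_alloc(pool, 64 * 1024, GFP_KERNEL | __GFP_NOWARN, 0,
 *                        &sg_cnt, &sgv, mem_lim, NULL);
 *    if (sg == NULL)
 *            return -ENOMEM;
 *    ...
 *    sgv_pool_free(sgv, mem_lim);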
*/ struct scatterlist *sgv_pool_alloc(struct sgv_pool *pool, unsigned int size, gfp_t gfp_mask, int flags, int *count, struct sgv_pool_obj **sgv, struct scst_mem_lim *mem_lim, void *priv) { struct sgv_pool_obj *obj; int cache_num, pages, cnt; struct scatterlist *res = NULL; int pages_to_alloc; int no_cached = flags & SGV_POOL_ALLOC_NO_CACHED; bool allowed_mem_checked = false, hiwmk_checked = false; TRACE_ENTRY(); if (unlikely(size == 0)) goto out; EXTRACHECKS_BUG_ON((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL); pages = PAGE_ALIGN(size) >> PAGE_SHIFT; if (pool->single_alloc_pages == 0) { int pages_order = get_order(size); cache_num = pages_order; pages_to_alloc = (1 << pages_order); } else { cache_num = 0; pages_to_alloc = max(pool->single_alloc_pages, pages); } TRACE_MEM("size=%d, pages=%d, pages_to_alloc=%d, cache num=%d, " "flags=%x, no_cached=%d, *sgv=%p", size, pages, pages_to_alloc, cache_num, flags, no_cached, *sgv); if (*sgv != NULL) { obj = *sgv; TRACE_MEM("Supplied obj %p, cache num %d", obj, obj->cache_num); EXTRACHECKS_BUG_ON(obj->sg_count != 0); if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail_free_sg_entries; allowed_mem_checked = true; if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; } else if ((pages_to_alloc <= pool->max_cached_pages) && !no_cached) { if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail; allowed_mem_checked = true; obj = sgv_get_obj(pool, cache_num, pages_to_alloc, gfp_mask, flags & SGV_POOL_ALLOC_GET_NEW); if (unlikely(obj == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "sgv_pool_obj failed (size %d)", size); goto out_fail; } if (obj->sg_count != 0) { TRACE_MEM("Cached obj %p", obj); atomic_inc(&pool->cache_acc[cache_num].hit_alloc); goto success; } if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) { if (!(flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL)) goto out_fail_free; } if (likely(!obj->recycling_list_entry.next)) { TRACE_MEM("Brand new obj %p", obj); } else if (unlikely(obj->sg_entries != obj->sg_entries_data)) { TRACE_MEM("Cached obj %p with sg_count == 0", obj); kfree(obj->sg_entries); obj->sg_entries = NULL; } if (pages_to_alloc <= sgv_max_local_pages) { obj->sg_entries = obj->sg_entries_data; sg_init_table(obj->sg_entries, pages_to_alloc); TRACE_MEM("sg_entries %p", obj->sg_entries); if (sgv_pool_clustered(pool)) { obj->trans_tbl = (struct trans_tbl_ent *) (obj->sg_entries + pages_to_alloc); TRACE_MEM("trans_tbl %p", obj->trans_tbl); /* * No need to clear trans_tbl, if needed, it * will be fully rewritten in * sgv_alloc_sg_entries(). 
*/ } } else { if (unlikely(sgv_alloc_arrays(obj, pages_to_alloc, gfp_mask) != 0)) goto out_fail_free; } if ((flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) && (flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL)) goto out_return; obj->allocator_priv = priv; if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; } else { int sz; pages_to_alloc = pages; if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail; allowed_mem_checked = true; if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) goto out_return2; sz = sizeof(*obj) + pages * sizeof(obj->sg_entries[0]); obj = kmalloc(sz, gfp_mask); if (unlikely(obj == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "sgv_pool_obj failed (size %d)", size); goto out_fail; } memset(obj, 0, sizeof(*obj)); obj->owner_pool = pool; cache_num = -1; obj->cache_num = cache_num; obj->pages = pages_to_alloc; obj->allocator_priv = priv; obj->sg_entries = obj->sg_entries_data; sg_init_table(obj->sg_entries, pages); if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; TRACE_MEM("Big or no_cached obj %p (size %d)", obj, sz); } obj->sg_count = sgv_alloc_sg_entries(obj->sg_entries, pages_to_alloc, gfp_mask, pool->clustering_type, obj->trans_tbl, &pool->alloc_fns, priv); if (unlikely(obj->sg_count <= 0)) { obj->sg_count = 0; if ((flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL) && (cache_num >= 0)) goto out_return1; else goto out_fail_free_sg_entries; } if (cache_num >= 0) { atomic_add(pages_to_alloc - obj->sg_count, &pool->cache_acc[cache_num].merged); } else { if (no_cached) { atomic_add(pages_to_alloc, &pool->other_pages); atomic_add(pages_to_alloc - obj->sg_count, &pool->other_merged); } else { atomic_add(pages_to_alloc, &pool->big_pages); atomic_add(pages_to_alloc - obj->sg_count, &pool->big_merged); } } success: if (cache_num >= 0) { int sg; atomic_inc(&pool->cache_acc[cache_num].total_alloc); if (sgv_pool_clustered(pool)) cnt = obj->trans_tbl[pages-1].sg_num; else cnt = pages; sg = cnt-1; obj->orig_sg = sg; obj->orig_length = obj->sg_entries[sg].length; if (sgv_pool_clustered(pool)) { obj->sg_entries[sg].length = (pages - obj->trans_tbl[sg].pg_count) << PAGE_SHIFT; } } else { cnt = obj->sg_count; if (no_cached) atomic_inc(&pool->other_alloc); else atomic_inc(&pool->big_alloc); } *count = cnt; res = obj->sg_entries; *sgv = obj; obj->sg_entries[cnt-1].length -= PAGE_ALIGN(size) - size; sg_mark_end(&obj->sg_entries[cnt-1]); TRACE_MEM("obj=%p, sg_entries %p (size=%d, pages=%d, sg_count=%d, " "count=%d, last_len=%d)", obj, obj->sg_entries, size, pages, obj->sg_count, *count, obj->sg_entries[obj->orig_sg].length); out: TRACE_EXIT_HRES(res); return res; out_return: obj->allocator_priv = priv; obj->owner_pool = pool; out_return1: *sgv = obj; TRACE_MEM("Returning failed obj %p", obj); out_return2: *count = pages_to_alloc; res = NULL; goto out_uncheck; out_fail_free_sg_entries: if (obj->sg_entries != obj->sg_entries_data) { if (obj->trans_tbl != (struct trans_tbl_ent *)obj->sg_entries_data) { /* kfree() handles NULL parameter */ kfree(obj->trans_tbl); obj->trans_tbl = NULL; } kfree(obj->sg_entries); obj->sg_entries = NULL; } out_fail_free: if (cache_num >= 0) { spin_lock_bh(&pool->sgv_pool_lock); sgv_dec_cached_entries(pool, pages_to_alloc); spin_unlock_bh(&pool->sgv_pool_lock); kmem_cache_free(pool->caches[obj->cache_num], obj); } else kfree(obj); out_fail: res = NULL; *count = 0; *sgv = NULL; TRACE_MEM("%s", "Allocation failed"); out_uncheck: if (hiwmk_checked) 
sgv_hiwmk_uncheck(pages_to_alloc); if (allowed_mem_checked) sgv_uncheck_allowed_mem(mem_lim, pages_to_alloc); goto out; } EXPORT_SYMBOL_GPL(sgv_pool_alloc); /* * sgv_get_priv - return the private allocation data * * Allows to get the allocation private data for this SGV * cache object. The private data supposed to be set by sgv_pool_alloc(). */ void *sgv_get_priv(struct sgv_pool_obj *obj) { return obj->allocator_priv; } EXPORT_SYMBOL_GPL(sgv_get_priv); /** * sgv_pool_free - free previously allocated SG vector * @obj: the SGV object to free * @mem_lim: memory limits * * Description: * Frees previously allocated SG vector and updates memory limits */ void sgv_pool_free(struct sgv_pool_obj *obj, struct scst_mem_lim *mem_lim) { int pages = (obj->sg_count != 0) ? obj->pages : 0; TRACE_MEM("Freeing obj %p, cache num %d, pages %d, sg_entries %p, " "sg_count %d, allocator_priv %p", obj, obj->cache_num, pages, obj->sg_entries, obj->sg_count, obj->allocator_priv); /* * Enable it if you are investigating a data corruption and want to make * sure that target or dev handler didn't leave the pages mapped somewhere and, * hence, provoked a data corruption. * * Make sure the check value for _count is set correctly. In most cases, 1 is * correct, but, e.g., iSCSI-SCST can call it with value 2, because * it frees the corresponding cmd before the last put_page() call from * net_put_page() for the last page in the SG. Also, user space dev handlers * usually have their memory mapped in their address space. */ #if 0 { struct scatterlist *sg = obj->sg_entries; int i; for (i = 0; i < obj->sg_count; i++) { struct page *p = sg_page(&sg[i]); int len = sg[i].length; int pages = PAGE_ALIGN(len) >> PAGE_SHIFT; while (pages > 0) { if (page_count(p) != 1) { PRINT_WARNING("Freeing page %p with " "additional owners (_count %d). " "Data corruption possible!", p, page_count(p)); WARN_ON(1); } pages--; p++; } } } #endif if (obj->cache_num >= 0) { obj->sg_entries[obj->orig_sg].length = obj->orig_length; sg_unmark_end(&obj->sg_entries[obj->orig_sg]); sgv_put_obj(obj); } else { obj->owner_pool->alloc_fns.free_pages_fn(obj->sg_entries, obj->sg_count, obj->allocator_priv); kfree(obj); sgv_hiwmk_uncheck(pages); } sgv_uncheck_allowed_mem(mem_lim, pages); return; } EXPORT_SYMBOL_GPL(sgv_pool_free); /* * scst_alloc_sg() - allocates an SG vector * * Allocates and returns pointer to SG vector with data size "size". * In *count returned the count of entries in the vector. * Returns NULL for failure. * * Please don't use it for massive commands data buffers, because it * isn't fair and don't account per device memory limits. Use sgv_pool_alloc() * instead. */ struct scatterlist *scst_alloc_sg(int size, gfp_t gfp_mask, int *count) { struct scatterlist *res; int pages = PAGE_ALIGN(size) >> PAGE_SHIFT; struct sgv_pool_alloc_fns sys_alloc_fns = { sgv_alloc_sys_pages, sgv_free_sys_sg_entries }; int no_fail = ((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL); int cnt; TRACE_ENTRY(); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_inc(&sgv_other_total_alloc); #endif if (unlikely(sgv_hiwmk_check(pages) != 0)) { if (!no_fail) { res = NULL; goto out; } else { /* * Update active_pages_total since alloc can't fail. * If it wasn't updated then the counter would cross 0 * on free again. 
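 *
 * Passing a negative value to sgv_hiwmk_uncheck() below effectively
 * adds the pages to sgv_pages_total, so the sgv_hiwmk_uncheck() done
 * later by scst_free_sg() stays balanced.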
*/ sgv_hiwmk_uncheck(-pages); } } res = kmalloc_array(pages, sizeof(*res), gfp_mask); if (res == NULL) { TRACE(TRACE_OUT_OF_MEM, "Unable to allocate sg for %d pages", pages); goto out_uncheck; } sg_init_table(res, pages); /* * If we allow use clustering here, we will have troubles in * scst_free_sg() to figure out how many pages are in the SG vector. * So, let's always don't use clustering. */ cnt = sgv_alloc_sg_entries(res, pages, gfp_mask, sgv_no_clustering, NULL, &sys_alloc_fns, NULL); if (cnt <= 0) goto out_free; res[cnt-1].length -= PAGE_ALIGN(size) - size; *count = cnt; out: TRACE_MEM("Alloced sg %p (count %d, no_fail %d)", res, *count, no_fail); TRACE_EXIT_HRES(res); return res; out_free: kfree(res); res = NULL; out_uncheck: if (!no_fail) sgv_hiwmk_uncheck(pages); goto out; } EXPORT_SYMBOL_GPL(scst_alloc_sg); /* * scst_free_sg() - frees SG vector * * Frees SG vector returned by scst_alloc_sg(). */ void scst_free_sg(struct scatterlist *sg, int count) { TRACE_MEM("Freeing sg=%p", sg); sgv_hiwmk_uncheck(count); sgv_free_sys_sg_entries(sg, count, NULL); kfree(sg); return; } EXPORT_SYMBOL_GPL(scst_free_sg); /* Must be called under sgv_pools_mutex */ static void sgv_pool_init_cache(struct sgv_pool *pool, int cache_num, bool per_cpu) { int size; int pages; struct sgv_pool_obj *obj; atomic_set(&pool->cache_acc[cache_num].total_alloc, 0); atomic_set(&pool->cache_acc[cache_num].hit_alloc, 0); atomic_set(&pool->cache_acc[cache_num].merged, 0); if (pool->single_alloc_pages == 0) pages = 1 << cache_num; else pages = pool->single_alloc_pages; if (pages <= sgv_max_local_pages) { size = sizeof(*obj) + pages * (sizeof(obj->sg_entries[0]) + ((pool->clustering_type != sgv_no_clustering) ? sizeof(obj->trans_tbl[0]) : 0)); } else if (pages <= sgv_max_trans_pages) { /* * sg_entries is allocated outside object, * but trans_tbl is still embedded. */ size = sizeof(*obj) + pages * (((pool->clustering_type != sgv_no_clustering) ? sizeof(obj->trans_tbl[0]) : 0)); } else { size = sizeof(*obj); /* both sgv and trans_tbl are kmalloc'ed() */ } TRACE_MEM("pages=%d, size=%d (per cpu %d)", pages, size, per_cpu); scnprintf(pool->cache_names[cache_num], sizeof(pool->cache_names[cache_num]), "%s-%uK", pool->name, (pages << PAGE_SHIFT) >> 10); pool->caches[cache_num] = kmem_cache_create( pool->cache_names[cache_num], size, 0, per_cpu ? 
SCST_SLAB_FLAGS : (SCST_SLAB_FLAGS|SLAB_HWCACHE_ALIGN), NULL #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) , NULL); #else ); #endif return; } /* Must be called under sgv_pools_mutex */ static int sgv_pool_init(struct sgv_pool *pool, const char *name, enum sgv_clustering_types clustering_type, int single_alloc_pages, int purge_interval, bool per_cpu) { int res = -ENOMEM; int i; TRACE_ENTRY(); if (single_alloc_pages < 0) { PRINT_ERROR("Wrong single_alloc_pages value %d", single_alloc_pages); res = -EINVAL; goto out; } memset(pool, 0, sizeof(*pool)); atomic_set(&pool->big_alloc, 0); atomic_set(&pool->big_pages, 0); atomic_set(&pool->big_merged, 0); atomic_set(&pool->other_alloc, 0); atomic_set(&pool->other_pages, 0); atomic_set(&pool->other_merged, 0); pool->clustering_type = clustering_type; pool->single_alloc_pages = single_alloc_pages; if (purge_interval != 0) { pool->purge_interval = purge_interval; if (purge_interval < 0) { /* Let's pretend that it's always scheduled */ pool->purge_work_scheduled = 1; } } else pool->purge_interval = SGV_DEFAULT_PURGE_INTERVAL; if (single_alloc_pages == 0) { pool->max_caches = SGV_POOL_ELEMENTS; pool->max_cached_pages = 1 << (SGV_POOL_ELEMENTS - 1); } else { pool->max_caches = 1; pool->max_cached_pages = single_alloc_pages; } pool->alloc_fns.alloc_pages_fn = sgv_alloc_sys_pages; pool->alloc_fns.free_pages_fn = sgv_free_sys_sg_entries; TRACE_MEM("name %s, sizeof(*obj)=%zd, clustering_type=%d, " "single_alloc_pages=%d, max_caches=%d, max_cached_pages=%d", name, sizeof(struct sgv_pool_obj), clustering_type, single_alloc_pages, pool->max_caches, pool->max_cached_pages); strlcpy(pool->name, name, sizeof(pool->name)-1); pool->owner_mm = current->mm; for (i = 0; i < pool->max_caches; i++) { sgv_pool_init_cache(pool, i, per_cpu); if (pool->caches[i] == NULL) { PRINT_ERROR("Allocation of sgv_pool " "cache %s(%d) failed", name, i); goto out_free; } } atomic_set(&pool->sgv_pool_ref, 1); spin_lock_init(&pool->sgv_pool_lock); INIT_LIST_HEAD(&pool->sorted_recycling_list); for (i = 0; i < pool->max_caches; i++) INIT_LIST_HEAD(&pool->recycling_lists[i]); #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)) INIT_DELAYED_WORK(&pool->sgv_purge_work, sgv_purge_work_fn); #else INIT_WORK(&pool->sgv_purge_work, sgv_purge_work_fn, pool); #endif spin_lock_bh(&sgv_pools_lock); list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list); spin_unlock_bh(&sgv_pools_lock); res = scst_sgv_sysfs_create(pool); if (res != 0) goto out_del; res = 0; out: TRACE_EXIT_RES(res); return res; out_del: spin_lock_bh(&sgv_pools_lock); list_del(&pool->sgv_pools_list_entry); spin_unlock_bh(&sgv_pools_lock); synchronize_rcu(); out_free: for (i = 0; i < pool->max_caches; i++) { if (pool->caches[i]) { kmem_cache_destroy(pool->caches[i]); pool->caches[i] = NULL; } else break; } goto out; } static void sgv_evaluate_local_max_pages(void) { int space4sgv_ttbl = PAGE_SIZE - sizeof(struct sgv_pool_obj); sgv_max_local_pages = space4sgv_ttbl / (sizeof(struct trans_tbl_ent) + sizeof(struct scatterlist)); sgv_max_trans_pages = space4sgv_ttbl / sizeof(struct trans_tbl_ent); TRACE_MEM("sgv_max_local_pages %d, sgv_max_trans_pages %d", sgv_max_local_pages, sgv_max_trans_pages); return; } /* * sgv_pool_flush() - flushes the SGV pool. * * Flushes, i.e. frees, all the cached entries in the SGV pool. 
*/ void sgv_pool_flush(struct sgv_pool *pool) { int i; TRACE_ENTRY(); for (i = 0; i < pool->max_caches; i++) { struct sgv_pool_obj *obj; spin_lock_bh(&pool->sgv_pool_lock); while (!list_empty(&pool->recycling_lists[i])) { obj = list_first_entry(&pool->recycling_lists[i], struct sgv_pool_obj, recycling_list_entry); __sgv_purge_from_cache(obj); spin_unlock_bh(&pool->sgv_pool_lock); EXTRACHECKS_BUG_ON(obj->owner_pool != pool); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } spin_unlock_bh(&pool->sgv_pool_lock); } TRACE_EXIT(); return; } EXPORT_SYMBOL_GPL(sgv_pool_flush); static void sgv_pool_destroy(struct sgv_pool *pool) { int i; TRACE_ENTRY(); sgv_pool_flush(pool); mutex_lock(&sgv_pools_mutex); spin_lock_bh(&sgv_pools_lock); list_del(&pool->sgv_pools_list_entry); spin_unlock_bh(&sgv_pools_lock); mutex_unlock(&sgv_pools_mutex); synchronize_rcu(); scst_sgv_sysfs_del(pool); cancel_delayed_work_sync(&pool->sgv_purge_work); for (i = 0; i < pool->max_caches; i++) { if (pool->caches[i]) kmem_cache_destroy(pool->caches[i]); pool->caches[i] = NULL; } kmem_cache_free(sgv_pool_cachep, pool); TRACE_EXIT(); return; } /** * sgv_pool_set_allocator - set custom pages allocator * @pool: the cache * @alloc_pages_fn: pages allocation function * @free_pages_fn: pages freeing function * * Description: * Allows to set custom pages allocator for the SGV pool. * See the SGV pool documentation for more details. */ void sgv_pool_set_allocator(struct sgv_pool *pool, struct page *(*alloc_pages_fn)(struct scatterlist *, gfp_t, void *), void (*free_pages_fn)(struct scatterlist *, int, void *)) { pool->alloc_fns.alloc_pages_fn = alloc_pages_fn; pool->alloc_fns.free_pages_fn = free_pages_fn; return; } EXPORT_SYMBOL_GPL(sgv_pool_set_allocator); /** * sgv_pool_create_node - creates and initializes an SGV pool * @name: the name of the SGV pool * @clustering_type: sets type of the pages clustering. * @single_alloc_pages: if 0, then the SGV pool will work in the set of * power 2 size buffers mode. If >0, then the SGV pool will * work in the fixed size buffers mode. In this case * single_alloc_pages sets the size of each buffer in pages. * @shared: sets if the SGV pool can be shared between devices or not. * The cache sharing allowed only between devices created inside * the same address space. If an SGV pool is shared, each * subsequent call of sgv_pool_create*() with the same cache name * will not create a new cache, but instead return a reference * to it. * @purge_interval: sets the cache purging interval. I.e., an SG buffer * will be freed if it's unused for time t * purge_interval <= t < 2*purge_interval. If purge_interval * is 0, then the default interval will be used (60 seconds). * If purge_interval <0, then the automatic purging will be * disabled. In HZ. * @nodeid: NUMA node for this pool. Can be NUMA_NO_NODE, if the * caller doesn't care. * * Description: * Returns the resulting SGV pool or NULL in case of any error. 
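 *
 * A minimal usage sketch (illustrative only; the pool name and the
 * parameters below are hypothetical):
 *
 *    struct sgv_pool *p;
 *
 *    p = sgv_pool_create_node("my-sgv-pool", sgv_no_clustering, 0,
 *                             false, 0, NUMA_NO_NODE);
 *    if (p == NULL)
 *            return -ENOMEM;
 *    ...
 *    sgv_pool_del(p);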
*/ struct sgv_pool *sgv_pool_create_node(const char *name, enum sgv_clustering_types clustering_type, int single_alloc_pages, bool shared, int purge_interval, int nodeid) { struct sgv_pool *pool, *tp; int rc; TRACE_ENTRY(); TRACE_MEM("Creating pool %s (clustering_type %d, " "single_alloc_pages %d, shared %d, purge_interval %d, " "nodeid %d)", name, clustering_type, single_alloc_pages, shared, purge_interval, nodeid); /* * __sgv_shrink() takes sgv_pools_mutex, so we have to play tricks to * prevent deadlock with it if this allocation will try to reclaim memory */ pool = kmem_cache_alloc_node(sgv_pool_cachep, GFP_KERNEL, nodeid); if (pool == NULL) { PRINT_ERROR("Allocation of sgv_pool failed (size %zd)", sizeof(*pool)); goto out; } memset(pool, 0, sizeof(*pool)); mutex_lock(&sgv_pools_mutex); list_for_each_entry(tp, &sgv_pools_list, sgv_pools_list_entry) { if (strcmp(tp->name, name) == 0) { if (shared) { if (tp->owner_mm != current->mm) { PRINT_ERROR("Attempt of a shared use " "of SGV pool %s with " "different MM", name); goto out_free; } sgv_pool_get(tp); goto out_free; } else { PRINT_ERROR("SGV pool %s already exists", name); tp = NULL; goto out_free; } } } tp = NULL; rc = sgv_pool_init(pool, name, clustering_type, single_alloc_pages, purge_interval, nodeid != NUMA_NO_NODE); if (rc != 0) goto out_free; out_unlock: mutex_unlock(&sgv_pools_mutex); out: TRACE_EXIT_RES(pool != NULL); return pool; out_free: kmem_cache_free(sgv_pool_cachep, pool); pool = tp; goto out_unlock; } EXPORT_SYMBOL_GPL(sgv_pool_create_node); /* * sgv_pool_get - increase ref counter for the corresponding SGV pool * * Increases ref counter for the corresponding SGV pool */ void sgv_pool_get(struct sgv_pool *pool) { atomic_inc(&pool->sgv_pool_ref); TRACE_MEM("Incrementing sgv pool %p ref (new value %d)", pool, atomic_read(&pool->sgv_pool_ref)); return; } EXPORT_SYMBOL_GPL(sgv_pool_get); /* * sgv_pool_put - decrease ref counter for the corresponding SGV pool * * Decreases ref counter for the corresponding SGV pool. If the ref * counter reaches 0, the cache will be destroyed. */ void sgv_pool_put(struct sgv_pool *pool) { TRACE_MEM("Decrementing sgv pool %p ref (new value %d)", pool, atomic_read(&pool->sgv_pool_ref)-1); if (atomic_dec_and_test(&pool->sgv_pool_ref)) sgv_pool_destroy(pool); return; } EXPORT_SYMBOL_GPL(sgv_pool_put); /** * sgv_pool_del - deletes the corresponding SGV pool * @pool: the cache to delete. * * Description: * If the cache is shared, it will decrease its reference counter. * If the reference counter reaches 0, the cache will be destroyed. */ void sgv_pool_del(struct sgv_pool *pool) { TRACE_ENTRY(); sgv_pool_put(pool); TRACE_EXIT(); return; } EXPORT_SYMBOL_GPL(sgv_pool_del); /* Both parameters in pages */ int scst_sgv_pools_init(unsigned long mem_hwmark, unsigned long mem_lwmark) { int res = 0, i; TRACE_ENTRY(); sgv_pool_cachep = KMEM_CACHE(sgv_pool, SCST_SLAB_FLAGS|SLAB_HWCACHE_ALIGN); if (sgv_pool_cachep == NULL) goto out_err; sgv_hi_wmk = mem_hwmark; sgv_lo_wmk = mem_lwmark; sgv_evaluate_local_max_pages(); sgv_norm_pool_main = sgv_pool_create("sgv", sgv_no_clustering, 0, false, 0); if (sgv_norm_pool_main == NULL) goto out_free_pool; sgv_norm_clust_pool_main = sgv_pool_create("sgv-clust", sgv_full_clustering, 0, false, 0); if (sgv_norm_clust_pool_main == NULL) goto out_free_norm; sgv_dma_pool_main = sgv_pool_create("sgv-dma", sgv_no_clustering, 0, false, 0); if (sgv_dma_pool_main == NULL) goto out_free_clust; /* * ToDo: not compatible with CPU hotplug! Notification * callbacks must be installed! 
*/ for (i = 0; i < nr_cpu_ids; i++) sgv_norm_pool_global[i] = sgv_norm_pool_main; for (i = 0; i < nr_cpu_ids; i++) sgv_norm_clust_pool_global[i] = sgv_norm_clust_pool_main; for (i = 0; i < nr_cpu_ids; i++) sgv_dma_pool_global[i] = sgv_dma_pool_main; for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-%d", i); sgv_norm_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_no_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_norm_pool_per_cpu[i] == NULL) goto out_free_per_cpu_norm; } for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-clust-%d", i); sgv_norm_clust_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_full_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_norm_clust_pool_per_cpu[i] == NULL) goto out_free_per_cpu_clust; } for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-dma-%d", i); sgv_dma_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_no_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_dma_pool_per_cpu[i] == NULL) goto out_free_per_cpu_dma; } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) sgv_shrinker = set_shrinker(DEFAULT_SEEKS, sgv_shrink); #else #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) sgv_shrinker.count_objects = sgv_can_be_shrunk; sgv_shrinker.scan_objects = sgv_scan_shrink; #else sgv_shrinker.shrink = sgv_shrink; #endif sgv_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sgv_shrinker); #endif out: TRACE_EXIT_RES(res); return res; out_free_per_cpu_dma: for (i = 0; i < nr_cpu_ids; i++) if (sgv_dma_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_dma_pool_per_cpu[i]); out_free_per_cpu_clust: for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_clust_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_clust_pool_per_cpu[i]); out_free_per_cpu_norm: for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_pool_per_cpu[i]); sgv_pool_destroy(sgv_dma_pool_main); out_free_clust: sgv_pool_destroy(sgv_norm_clust_pool_main); out_free_norm: sgv_pool_destroy(sgv_norm_pool_main); out_free_pool: kmem_cache_destroy(sgv_pool_cachep); out_err: res = -ENOMEM; goto out; } void scst_sgv_pools_deinit(void) { int i; TRACE_ENTRY(); #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) remove_shrinker(sgv_shrinker); #else unregister_shrinker(&sgv_shrinker); #endif sgv_pool_destroy(sgv_dma_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_dma_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_dma_pool_per_cpu[i]); sgv_pool_destroy(sgv_norm_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_pool_per_cpu[i]); sgv_pool_destroy(sgv_norm_clust_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_clust_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_clust_pool_per_cpu[i]); for (i = 0; i < nr_cpu_ids; i++) sgv_norm_pool_global[i] = NULL; for (i = 0; i < nr_cpu_ids; i++) sgv_norm_clust_pool_global[i] = NULL; for (i = 0; i < nr_cpu_ids; i++) sgv_dma_pool_global[i] = NULL; kmem_cache_destroy(sgv_pool_cachep); TRACE_EXIT(); return; } static ssize_t sgv_sysfs_stat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct sgv_pool *pool; int i, total = 0, hit = 0, merged = 0, allocated = 0; int oa, om, res; pool = container_of(kobj, struct sgv_pool, sgv_kobj); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { int t; hit += atomic_read(&pool->cache_acc[i].hit_alloc); total += 
atomic_read(&pool->cache_acc[i].total_alloc); t = atomic_read(&pool->cache_acc[i].total_alloc) - atomic_read(&pool->cache_acc[i].hit_alloc); allocated += t * (1 << i); merged += atomic_read(&pool->cache_acc[i].merged); } res = sprintf(buf, "%-30s %-11s %-11s %-11s %-11s", "Name", "Hit", "Total", "% merged", "Cached (P/I/O)"); res += sprintf(&buf[res], "\n%-30s %-11d %-11d %-11d %d/%d/%d\n", pool->name, hit, total, (allocated != 0) ? merged*100/allocated : 0, pool->cached_pages, pool->inactive_cached_pages, pool->cached_entries); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { int t = atomic_read(&pool->cache_acc[i].total_alloc) - atomic_read(&pool->cache_acc[i].hit_alloc); allocated = t * (1 << i); merged = atomic_read(&pool->cache_acc[i].merged); res += sprintf(&buf[res], " %-28s %-11d %-11d %d\n", pool->cache_names[i], atomic_read(&pool->cache_acc[i].hit_alloc), atomic_read(&pool->cache_acc[i].total_alloc), (allocated != 0) ? merged*100/allocated : 0); } allocated = atomic_read(&pool->big_pages); merged = atomic_read(&pool->big_merged); oa = atomic_read(&pool->other_pages); om = atomic_read(&pool->other_merged); res += sprintf(&buf[res], " %-40s %d/%-9d %d/%d\n", "big/other", atomic_read(&pool->big_alloc), atomic_read(&pool->other_alloc), (allocated != 0) ? merged*100/allocated : 0, (oa != 0) ? om/oa : 0); return res; } static ssize_t sgv_sysfs_stat_reset(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct sgv_pool *pool; int i; TRACE_ENTRY(); pool = container_of(kobj, struct sgv_pool, sgv_kobj); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { atomic_set(&pool->cache_acc[i].hit_alloc, 0); atomic_set(&pool->cache_acc[i].total_alloc, 0); atomic_set(&pool->cache_acc[i].merged, 0); } atomic_set(&pool->big_pages, 0); atomic_set(&pool->big_merged, 0); atomic_set(&pool->big_alloc, 0); atomic_set(&pool->other_pages, 0); atomic_set(&pool->other_merged, 0); atomic_set(&pool->other_alloc, 0); PRINT_INFO("Statistics for SGV pool %s reset", pool->name); TRACE_EXIT_RES(count); return count; } static ssize_t sgv_sysfs_global_stat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct sgv_pool *pool; int inactive_pages = 0, res; TRACE_ENTRY(); spin_lock_bh(&sgv_pools_lock); list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) { inactive_pages += pool->inactive_cached_pages; } spin_unlock_bh(&sgv_pools_lock); #ifdef CONFIG_SCST_NO_TOTAL_MEM_CHECKS res = sprintf(buf, "%-42s %d\n", "Inactive pages", inactive_pages); #else res = sprintf(buf, "%-42s %d/%d\n%-42s %d/%d\n%-42s %d/%d\n" "%-42s %-11d\n", "Inactive/active pages", inactive_pages, atomic_read(&sgv_pages_total) - inactive_pages, "Hi/lo watermarks [pages]", sgv_hi_wmk, sgv_lo_wmk, "Hi watermark releases/failures", atomic_read(&sgv_releases_on_hiwmk), atomic_read(&sgv_releases_on_hiwmk_failed), "Other allocs", atomic_read(&sgv_other_total_alloc)); #endif TRACE_EXIT(); return res; } static ssize_t sgv_sysfs_global_stat_reset(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { TRACE_ENTRY(); atomic_set(&sgv_releases_on_hiwmk, 0); atomic_set(&sgv_releases_on_hiwmk_failed, 0); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_set(&sgv_other_total_alloc, 0); #endif PRINT_INFO("%s", "Global SGV pool statistics reset"); TRACE_EXIT_RES(count); return count; } static struct kobj_attribute sgv_stat_attr = __ATTR(stats, S_IRUGO | S_IWUSR, sgv_sysfs_stat_show, sgv_sysfs_stat_reset); static struct attribute *sgv_attrs[] = { &sgv_stat_attr.attr, NULL, }; static void 
sgv_kobj_release(struct kobject *kobj) { struct sgv_pool *pool; TRACE_ENTRY(); pool = container_of(kobj, struct sgv_pool, sgv_kobj); if (pool->sgv_kobj_release_cmpl != NULL) complete_all(pool->sgv_kobj_release_cmpl); TRACE_EXIT(); return; } static struct kobj_type sgv_pool_ktype = { .sysfs_ops = &scst_sysfs_ops, .release = sgv_kobj_release, .default_attrs = sgv_attrs, }; static int scst_sgv_sysfs_create(struct sgv_pool *pool) { int res; TRACE_ENTRY(); res = kobject_init_and_add(&pool->sgv_kobj, &sgv_pool_ktype, scst_sgv_kobj, pool->name); if (res != 0) { PRINT_ERROR("Can't add sgv pool %s to sysfs", pool->name); goto out; } out: TRACE_EXIT_RES(res); return res; } static void scst_sgv_sysfs_del(struct sgv_pool *pool) { DECLARE_COMPLETION_ONSTACK(c); TRACE_ENTRY(); pool->sgv_kobj_release_cmpl = &c; kobject_del(&pool->sgv_kobj); SCST_KOBJECT_PUT_AND_WAIT(&pool->sgv_kobj, "SGV pool", &c, &scst_pool_dep_map); TRACE_EXIT(); } static struct kobj_attribute sgv_global_stat_attr = __ATTR(global_stats, S_IRUGO | S_IWUSR, sgv_sysfs_global_stat_show, sgv_sysfs_global_stat_reset); static struct attribute *sgv_default_attrs[] = { &sgv_global_stat_attr.attr, NULL, }; static void scst_sysfs_release(struct kobject *kobj) { kfree(kobj); } static struct kobj_type sgv_ktype = { .sysfs_ops = &scst_sysfs_ops, .release = scst_sysfs_release, .default_attrs = sgv_default_attrs, }; /* * scst_add_sgv_kobj() - Initialize and add the root SGV kernel object. */ int scst_add_sgv_kobj(struct kobject *parent, const char *name) { int res; WARN_ON(scst_sgv_kobj); res = -ENOMEM; scst_sgv_kobj = kzalloc(sizeof(*scst_sgv_kobj), GFP_KERNEL); if (!scst_sgv_kobj) goto out; res = kobject_init_and_add(scst_sgv_kobj, &sgv_ktype, parent, name); if (res != 0) goto out_free; out: return res; out_free: kobject_put(scst_sgv_kobj); scst_sgv_kobj = NULL; goto out; } /** * scst_del_put_sgv_kobj() - Remove the root SGV kernel object. */ void scst_del_put_sgv_kobj(void) { WARN_ON(!scst_sgv_kobj); kobject_del(scst_sgv_kobj); kobject_put(scst_sgv_kobj); scst_sgv_kobj = NULL; }
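/*
 * A minimal, illustrative sketch of the SGV pool lifecycle as seen from a
 * pool user, kept under "#if 0" like the other illustrative snippets in
 * this file. The function name and buffer size are hypothetical, and it
 * assumes scst_init_mem_lim() from scst.h for setting up the memory limit.
 */
#if 0
static int sgv_pool_usage_example(void)
{
	struct scst_mem_lim mem_lim;
	struct sgv_pool *pool;
	struct sgv_pool_obj *sgv = NULL;
	struct scatterlist *sg;
	int sg_cnt, res = -ENOMEM;

	scst_init_mem_lim(&mem_lim);

	/* Power-of-2 caches, no clustering, default purge interval */
	pool = sgv_pool_create("example", sgv_no_clustering, 0, false, 0);
	if (pool == NULL)
		goto out;

	sg = sgv_pool_alloc(pool, 128 * 1024, GFP_KERNEL | __GFP_NOWARN, 0,
			    &sg_cnt, &sgv, &mem_lim, NULL);
	if (sg == NULL)
		goto out_del;

	/* ... use sg[0..sg_cnt-1] ... */

	sgv_pool_free(sgv, &mem_lim);
	res = 0;

out_del:
	sgv_pool_del(pool);
out:
	return res;
}
#endif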