/* * scst_mem.c * * Copyright (C) 2006 - 2018 Vladislav Bolkhovitin * Copyright (C) 2007 - 2018 Western Digital Corporation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, version 2 * of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #ifdef INSIDE_KERNEL_TREE #include #else #include "scst.h" #endif #include "scst_priv.h" #include "scst_mem.h" #define SGV_DEFAULT_PURGE_INTERVAL (60 * HZ) #define SGV_MIN_SHRINK_INTERVAL (1 * HZ) /* Max pages freed from a pool per shrinking iteration */ #define MAX_PAGES_PER_POOL 50 bool scst_force_global_sgv_pool; static struct sgv_pool *sgv_dma_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_norm_pool_per_cpu[NR_CPUS]; static struct sgv_pool *sgv_dma_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_pool_global[NR_CPUS]; static struct sgv_pool *sgv_norm_clust_pool_main, *sgv_norm_pool_main, *sgv_dma_pool_main; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29) #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_SCST_PROC) static struct lock_class_key scst_pool_key; static struct lockdep_map scst_pool_dep_map = STATIC_LOCKDEP_MAP_INIT("scst_pool_kref", &scst_pool_key); #endif #endif #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS static atomic_t sgv_pages_total = ATOMIC_INIT(0); #endif /* Both read-only */ static int sgv_hi_wmk; static int sgv_lo_wmk; static int sgv_max_local_pages, sgv_max_trans_pages; static DEFINE_SPINLOCK(sgv_pools_lock); /* inner lock for sgv_pool_lock! */ static DEFINE_MUTEX(sgv_pools_mutex); static atomic_t sgv_releases_on_hiwmk = ATOMIC_INIT(0); static atomic_t sgv_releases_on_hiwmk_failed = ATOMIC_INIT(0); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS static atomic_t sgv_other_total_alloc = ATOMIC_INIT(0); #endif #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) static struct shrinker *sgv_shrinker; #else static struct shrinker sgv_shrinker; #endif static struct kmem_cache *sgv_pool_cachep; /* * Protected by sgv_pools_mutex AND sgv_pools_lock for writes, * either one for reads. 
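 *
 * Write-side locking pattern, as used by sgv_pool_init() and
 * sgv_pool_destroy() below (illustrative sketch only):
 *
 *   mutex_lock(&sgv_pools_mutex);
 *   spin_lock_bh(&sgv_pools_lock);
 *   list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list);
 *   spin_unlock_bh(&sgv_pools_lock);
 *   mutex_unlock(&sgv_pools_mutex);
 *
 * Readers either take one of the locks (see __sgv_can_be_shrunk()) or
 * walk the list under RCU (see __sgv_shrink()); writers call
 * synchronize_rcu() after unlinking a pool.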
*/ static LIST_HEAD(sgv_pools_list); static struct kobject *scst_sgv_kobj; static int scst_sgv_sysfs_create(struct sgv_pool *pool); static void scst_sgv_sysfs_del(struct sgv_pool *pool); static inline bool sgv_pool_clustered(const struct sgv_pool *pool) { return pool->clustering_type != sgv_no_clustering; } void scst_sgv_pool_use_norm(struct scst_tgt_dev *tgt_dev) { tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_norm_pool_per_cpu; else tgt_dev->pools = sgv_norm_pool_global; tgt_dev->tgt_dev_clust_pool = 0; } void scst_sgv_pool_use_norm_clust(struct scst_tgt_dev *tgt_dev) { TRACE_MEM("%s", "Use clustering"); tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_norm_clust_pool_per_cpu; else tgt_dev->pools = sgv_norm_clust_pool_global; tgt_dev->tgt_dev_clust_pool = 1; } void scst_sgv_pool_use_dma(struct scst_tgt_dev *tgt_dev) { TRACE_MEM("%s", "Use ISA DMA memory"); tgt_dev->tgt_dev_gfp_mask = __GFP_NOWARN | GFP_DMA; if (!scst_force_global_sgv_pool) tgt_dev->pools = sgv_dma_pool_per_cpu; else tgt_dev->pools = sgv_dma_pool_global; tgt_dev->tgt_dev_clust_pool = 0; } /* Must be no locks */ static void sgv_dtor_and_free(struct sgv_pool_obj *obj) { struct sgv_pool *pool = obj->owner_pool; TRACE_MEM("Destroying sgv obj %p", obj); if (obj->sg_count != 0) { pool->alloc_fns.free_pages_fn(obj->sg_entries, obj->sg_count, obj->allocator_priv); } if (obj->sg_entries != obj->sg_entries_data) { if (obj->trans_tbl != (struct trans_tbl_ent *)obj->sg_entries_data) { /* kfree() handles NULL parameter */ kfree(obj->trans_tbl); obj->trans_tbl = NULL; } kfree(obj->sg_entries); } kmem_cache_free(pool->caches[obj->cache_num], obj); return; } /* Must be called under sgv_pool_lock held */ static void sgv_dec_cached_entries(struct sgv_pool *pool, int pages) { pool->cached_entries--; pool->cached_pages -= pages; } /* Must be called under sgv_pool_lock held */ static void __sgv_purge_from_cache(struct sgv_pool_obj *obj) { int pages = obj->pages; struct sgv_pool *pool = obj->owner_pool; TRACE_MEM("Purging sgv obj %p from pool %p (new cached_entries %d)", obj, pool, pool->cached_entries-1); list_del(&obj->sorted_recycling_list_entry); list_del(&obj->recycling_list_entry); pool->inactive_cached_pages -= pages; sgv_dec_cached_entries(pool, pages); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_sub(pages, &sgv_pages_total); #endif return; } /* Must be called under sgv_pool_lock held */ static bool sgv_purge_from_cache(struct sgv_pool_obj *obj, int min_interval, unsigned long cur_time) { EXTRACHECKS_BUG_ON(min_interval < 0); TRACE_MEM("Checking if sgv obj %p should be purged (cur time %ld, " "obj time %ld, time to purge %ld)", obj, cur_time, obj->time_stamp, obj->time_stamp + min_interval); if (time_after_eq(cur_time, (obj->time_stamp + min_interval))) { __sgv_purge_from_cache(obj); return true; } return false; } /* No locks */ static int sgv_shrink_pool(struct sgv_pool *pool, int nr, int min_interval, unsigned long cur_time, int *out_freed) { int freed = 0; TRACE_ENTRY(); TRACE_MEM("Trying to shrink pool %p (nr %d, min_interval %d)", pool, nr, min_interval); if (pool->purge_interval < 0) { TRACE_MEM("Not shrinkable pool %p, skipping", pool); goto out; } spin_lock_bh(&pool->sgv_pool_lock); while (!list_empty(&pool->sorted_recycling_list) && #ifdef CONFIG_SCST_NO_TOTAL_MEM_CHECKS true) { #else (atomic_read(&sgv_pages_total) > sgv_lo_wmk)) { #endif struct sgv_pool_obj *obj = list_first_entry( &pool->sorted_recycling_list, struct 
sgv_pool_obj, sorted_recycling_list_entry); if (sgv_purge_from_cache(obj, min_interval, cur_time)) { int pages = obj->pages; freed += pages; nr -= pages; TRACE_MEM("%d pages purged from pool %p (nr left %d, " "total freed %d)", pages, pool, nr, freed); spin_unlock_bh(&pool->sgv_pool_lock); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } else break; if ((nr <= 0) || (freed >= MAX_PAGES_PER_POOL)) { if (freed >= MAX_PAGES_PER_POOL) TRACE_MEM("%d pages purged from pool %p, " "leaving", freed, pool); break; } } spin_unlock_bh(&pool->sgv_pool_lock); out: *out_freed += freed; TRACE_EXIT_RES(nr); return nr; } /* No locks */ static int __sgv_shrink(int nr, int min_interval, int *out_freed) { struct sgv_pool *pool; unsigned long cur_time = jiffies; int prev_nr = nr + 1; TRACE_ENTRY(); TRACE_MEM("Trying to shrink %d pages from all sgv pools " "(min_interval %d)", nr, min_interval); while (prev_nr > nr && nr > 0) { prev_nr = nr; rcu_read_lock(); list_for_each_entry_rcu(pool, &sgv_pools_list, sgv_pools_list_entry) { if (pool->cached_entries) nr = sgv_shrink_pool(pool, nr, min_interval, cur_time, out_freed); } rcu_read_unlock(); } TRACE_EXIT_RES(nr); return nr; } static unsigned long __sgv_can_be_shrunk(void) { unsigned long res; struct sgv_pool *pool; int inactive_pages = 0; TRACE_ENTRY(); spin_lock_bh(&sgv_pools_lock); list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) { if (pool->purge_interval > 0) inactive_pages += pool->inactive_cached_pages; } spin_unlock_bh(&sgv_pools_lock); res = max(0, inactive_pages - sgv_lo_wmk); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS TRACE_MEM("Can free %ld (total %d)", res, atomic_read(&sgv_pages_total)); #endif TRACE_EXIT_RES(res); return res; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) static unsigned long sgv_can_be_shrunk(struct shrinker *shrinker, struct shrink_control *sc) { return __sgv_can_be_shrunk(); } static unsigned long sgv_scan_shrink(struct shrinker *shrinker, struct shrink_control *sc) { int freed = 0; TRACE_ENTRY(); __sgv_shrink(sc->nr_to_scan, SGV_MIN_SHRINK_INTERVAL, &freed); TRACE_MEM("Freed %d", freed); TRACE_EXIT_RES(freed); return freed; } #else /* if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35) && (!defined(RHEL_MAJOR) || RHEL_MAJOR -0 < 6) static int sgv_shrink(int nr, gfp_t gfpm) #elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) static int sgv_shrink(struct shrinker *shrinker, int nr, gfp_t gfpm) #else static int sgv_shrink(struct shrinker *shrinker, struct shrink_control *sc) #endif { #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) int nr = sc->nr_to_scan; #endif int freed = 0; TRACE_ENTRY(); if (nr > 0) { nr = __sgv_shrink(nr, SGV_MIN_SHRINK_INTERVAL, &freed); TRACE_MEM("Left %d", nr); } else nr = __sgv_can_be_shrunk(); TRACE_EXIT_RES(nr); return nr; } #endif /* if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) static void sgv_purge_work_fn(void *p) #else static void sgv_purge_work_fn(struct work_struct *work) #endif { unsigned long cur_time = jiffies; #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) struct sgv_pool *pool = (struct sgv_pool *)p; #else struct sgv_pool *pool = container_of(work, struct sgv_pool, sgv_purge_work.work); #endif TRACE_ENTRY(); TRACE_MEM("Purge work for pool %p", pool); spin_lock_bh(&pool->sgv_pool_lock); pool->purge_work_scheduled = false; while (!list_empty(&pool->sorted_recycling_list)) { struct sgv_pool_obj *obj = list_first_entry( &pool->sorted_recycling_list, struct 
sgv_pool_obj, sorted_recycling_list_entry); if (sgv_purge_from_cache(obj, pool->purge_interval, cur_time)) { spin_unlock_bh(&pool->sgv_pool_lock); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } else { /* * Let's reschedule it for full period to not get here * too often. In the worst case we have shrinker * to reclaim buffers more quickly. */ TRACE_MEM("Rescheduling purge work for pool %p (delay " "%d HZ/%d sec)", pool, pool->purge_interval, pool->purge_interval/HZ); schedule_delayed_work(&pool->sgv_purge_work, pool->purge_interval); pool->purge_work_scheduled = true; break; } } spin_unlock_bh(&pool->sgv_pool_lock); TRACE_MEM("Leaving purge work for pool %p", pool); TRACE_EXIT(); return; } static int sgv_check_full_clustering(struct scatterlist *sg, int cur, int hint) { int res = -1; int i = hint; unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur])); int len_cur = sg[cur].length; unsigned long pfn_cur_next = pfn_cur + (len_cur >> PAGE_SHIFT); int full_page_cur = (len_cur & (PAGE_SIZE - 1)) == 0; unsigned long pfn, pfn_next; bool full_page; #if 0 TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d", pfn_cur, pfn_cur_next, len_cur, full_page_cur); #endif /* check the hint first */ if (i >= 0) { pfn = page_to_pfn(sg_page(&sg[i])); pfn_next = pfn + (sg[i].length >> PAGE_SHIFT); full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0; if ((pfn == pfn_cur_next) && full_page_cur) goto out_head; if ((pfn_next == pfn_cur) && full_page) goto out_tail; } /* ToDo: implement more intelligent search */ for (i = cur - 1; i >= 0; i--) { pfn = page_to_pfn(sg_page(&sg[i])); pfn_next = pfn + (sg[i].length >> PAGE_SHIFT); full_page = (sg[i].length & (PAGE_SIZE - 1)) == 0; if ((pfn == pfn_cur_next) && full_page_cur) goto out_head; if ((pfn_next == pfn_cur) && full_page) goto out_tail; } out: return res; out_tail: TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, i); sg[i].length += len_cur; sg_clear(&sg[cur]); res = i; goto out; out_head: TRACE_MEM("SG segment %d will be head merged with segment %d", cur, i); sg_assign_page(&sg[i], sg_page(&sg[cur])); sg[i].length += len_cur; sg_clear(&sg[cur]); res = i; goto out; } static int sgv_check_tail_clustering(struct scatterlist *sg, int cur, int hint) { int res = -1; unsigned long pfn_cur = page_to_pfn(sg_page(&sg[cur])); int len_cur = sg[cur].length; int prev; unsigned long pfn_prev; bool full_page; #ifdef SCST_HIGHMEM if (page >= highmem_start_page) { TRACE_MEM("%s", "HIGHMEM page allocated, no clustering") goto out; } #endif #if 0 TRACE_MEM("pfn_cur %ld, pfn_cur_next %ld, len_cur %d, full_page_cur %d", pfn_cur, pfn_cur_next, len_cur, full_page_cur); #endif if (cur == 0) goto out; prev = cur - 1; pfn_prev = page_to_pfn(sg_page(&sg[prev])) + (sg[prev].length >> PAGE_SHIFT); full_page = (sg[prev].length & (PAGE_SIZE - 1)) == 0; if ((pfn_prev == pfn_cur) && full_page) { TRACE_MEM("SG segment %d will be tail merged with segment %d", cur, prev); sg[prev].length += len_cur; sg_clear(&sg[cur]); res = prev; } out: return res; } static void sgv_free_sys_sg_entries(struct scatterlist *sg, int sg_count, void *priv) { int i; TRACE_MEM("sg=%p, sg_count=%d", sg, sg_count); for (i = 0; i < sg_count; i++) { struct page *p = sg_page(&sg[i]); int len = sg[i].length; int pages = PAGE_ALIGN(len) >> PAGE_SHIFT; TRACE_MEM("page %lx, len %d, pages %d", (unsigned long)p, len, pages); while (pages > 0) { int order = 0; TRACE_MEM("free_pages(): order %d, page %lx", order, (unsigned long)p); __free_pages(p, order); pages -= 1 << order; p += 1 << 
order; } } } static struct page *sgv_alloc_sys_pages(struct scatterlist *sg, gfp_t gfp_mask, void *priv) { struct page *page = alloc_pages(gfp_mask, 0); sg_set_page(sg, page, PAGE_SIZE, 0); TRACE_MEM("page=%p, sg=%p, priv=%p", page, sg, priv); if (page == NULL) { TRACE(TRACE_OUT_OF_MEM, "%s", "Allocation of " "sg page failed"); } return page; } static int sgv_alloc_sg_entries(struct scatterlist *sg, int pages, gfp_t gfp_mask, enum sgv_clustering_types clustering_type, struct trans_tbl_ent *trans_tbl, const struct sgv_pool_alloc_fns *alloc_fns, void *priv) { int sg_count = 0; int pg, i, j; int merged = -1; TRACE_MEM("pages=%d, clustering_type=%d", pages, clustering_type); #if 0 gfp_mask |= __GFP_COLD; #endif #ifdef CONFIG_SCST_STRICT_SECURITY gfp_mask |= __GFP_ZERO; #endif for (pg = 0; pg < pages; pg++) { void *rc; #ifdef CONFIG_SCST_DEBUG_OOM if (((gfp_mask & __GFP_NOFAIL) != __GFP_NOFAIL) && ((scst_random() % 10000) == 55)) rc = NULL; else #endif rc = alloc_fns->alloc_pages_fn(&sg[sg_count], gfp_mask, priv); if (rc == NULL) goto out_no_mem; /* * This code allows compiler to see full body of the clustering * functions and gives it a chance to generate better code. * At least, the resulting code is smaller, comparing to * calling them using a function pointer. */ if (clustering_type == sgv_full_clustering) merged = sgv_check_full_clustering(sg, sg_count, merged); else if (clustering_type == sgv_tail_clustering) merged = sgv_check_tail_clustering(sg, sg_count, merged); else merged = -1; if (merged == -1) sg_count++; TRACE_MEM("pg=%d, merged=%d, sg_count=%d", pg, merged, sg_count); } if ((clustering_type != sgv_no_clustering) && (trans_tbl != NULL)) { pg = 0; for (i = 0; i < pages; i++) { int n = PAGE_ALIGN(sg[i].length) >> PAGE_SHIFT; trans_tbl[i].pg_count = pg; for (j = 0; j < n; j++) trans_tbl[pg++].sg_num = i+1; TRACE_MEM("i=%d, n=%d, pg_count=%d", i, n, trans_tbl[i].pg_count); } } out: TRACE_MEM("sg_count=%d", sg_count); return sg_count; out_no_mem: alloc_fns->free_pages_fn(sg, sg_count, priv); sg_count = 0; goto out; } static int sgv_alloc_arrays(struct sgv_pool_obj *obj, int pages_to_alloc, gfp_t gfp_mask) { int sz, tsz = 0; int res = 0; TRACE_ENTRY(); sz = pages_to_alloc * sizeof(obj->sg_entries[0]); obj->sg_entries = kmalloc(sz, gfp_mask); if (unlikely(obj->sg_entries == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of sgv_pool_obj " "SG vector failed (size %d)", sz); res = -ENOMEM; goto out; } sg_init_table(obj->sg_entries, pages_to_alloc); if (sgv_pool_clustered(obj->owner_pool)) { if (pages_to_alloc <= sgv_max_trans_pages) { obj->trans_tbl = (struct trans_tbl_ent *)obj->sg_entries_data; /* * No need to clear trans_tbl, if needed, it will be * fully rewritten in sgv_alloc_sg_entries() */ } else { tsz = pages_to_alloc * sizeof(obj->trans_tbl[0]); obj->trans_tbl = kzalloc(tsz, gfp_mask); if (unlikely(obj->trans_tbl == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "trans_tbl failed (size %d)", tsz); res = -ENOMEM; goto out_free; } } } TRACE_MEM("pages_to_alloc %d, sz %d, tsz %d, obj %p, sg_entries %p, " "trans_tbl %p", pages_to_alloc, sz, tsz, obj, obj->sg_entries, obj->trans_tbl); out: TRACE_EXIT_RES(res); return res; out_free: kfree(obj->sg_entries); obj->sg_entries = NULL; goto out; } static struct sgv_pool_obj *sgv_get_obj(struct sgv_pool *pool, int cache_num, int pages, gfp_t gfp_mask, bool get_new) { struct sgv_pool_obj *obj; spin_lock_bh(&pool->sgv_pool_lock); if (unlikely(get_new)) { /* Used only for buffers preallocation */ goto get_new; } if 
(likely(!list_empty(&pool->recycling_lists[cache_num]))) { obj = list_first_entry(&pool->recycling_lists[cache_num], struct sgv_pool_obj, recycling_list_entry); list_del(&obj->sorted_recycling_list_entry); list_del(&obj->recycling_list_entry); pool->inactive_cached_pages -= pages; spin_unlock_bh(&pool->sgv_pool_lock); goto out; } get_new: pool->cached_entries++; pool->cached_pages += pages; spin_unlock_bh(&pool->sgv_pool_lock); TRACE_MEM("New cached entries %d (pool %p)", pool->cached_entries, pool); obj = kmem_cache_alloc(pool->caches[cache_num], gfp_mask & ~(__GFP_HIGHMEM|GFP_DMA)); if (likely(obj)) { memset(obj, 0, sizeof(*obj)); obj->cache_num = cache_num; obj->pages = pages; obj->owner_pool = pool; } else { spin_lock_bh(&pool->sgv_pool_lock); sgv_dec_cached_entries(pool, pages); spin_unlock_bh(&pool->sgv_pool_lock); } out: return obj; } static void sgv_put_obj(struct sgv_pool_obj *obj) { struct sgv_pool *pool = obj->owner_pool; struct list_head *entry; struct list_head *list = &pool->recycling_lists[obj->cache_num]; int pages = obj->pages; spin_lock_bh(&pool->sgv_pool_lock); TRACE_MEM("sgv %p, cache num %d, pages %d, sg_count %d", obj, obj->cache_num, pages, obj->sg_count); if (sgv_pool_clustered(pool)) { /* Make objects with less entries more preferred */ __list_for_each(entry, list) { struct sgv_pool_obj *tmp = list_entry(entry, struct sgv_pool_obj, recycling_list_entry); TRACE_MEM("tmp %p, cache num %d, pages %d, sg_count %d", tmp, tmp->cache_num, tmp->pages, tmp->sg_count); if (obj->sg_count <= tmp->sg_count) break; } entry = entry->prev; } else entry = list; TRACE_MEM("Adding in %p (list %p)", entry, list); list_add(&obj->recycling_list_entry, entry); list_add_tail(&obj->sorted_recycling_list_entry, &pool->sorted_recycling_list); obj->time_stamp = jiffies; pool->inactive_cached_pages += pages; if (!pool->purge_work_scheduled) { TRACE_MEM("Scheduling purge work for pool %p", pool); pool->purge_work_scheduled = true; schedule_delayed_work(&pool->sgv_purge_work, pool->purge_interval); } spin_unlock_bh(&pool->sgv_pool_lock); return; } /* No locks */ static int sgv_hiwmk_check(int pages_to_alloc) { int res = 0; #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS int pages = pages_to_alloc; pages += atomic_read(&sgv_pages_total); if (unlikely(pages > sgv_hi_wmk)) { int freed = 0; pages -= sgv_hi_wmk; atomic_inc(&sgv_releases_on_hiwmk); pages = __sgv_shrink(pages, 0, &freed); if (pages > 0) { TRACE(TRACE_OUT_OF_MEM, "Requested amount of " "memory (%d pages) for being executed " "commands together with the already " "allocated memory exceeds the allowed " "maximum %d. 
Should you increase "
				"scst_max_cmd_mem?", pages_to_alloc, sgv_hi_wmk);
			atomic_inc(&sgv_releases_on_hiwmk_failed);
			res = -ENOMEM;
			goto out_unlock;
		}
	}

	atomic_add(pages_to_alloc, &sgv_pages_total);

out_unlock:
	TRACE_MEM("pages_to_alloc %d, new total %d", pages_to_alloc,
		atomic_read(&sgv_pages_total));

#endif
	return res;
}

/* No locks */
static void sgv_hiwmk_uncheck(int pages)
{
#ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS
	atomic_sub(pages, &sgv_pages_total);
	TRACE_MEM("pages %d, new total %d", pages,
		atomic_read(&sgv_pages_total));
#endif
	return;
}

/* No locks */
static bool sgv_check_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
{
	int alloced;
	bool res = true;

	alloced = atomic_add_return(pages, &mem_lim->alloced_pages);
	if (unlikely(alloced > mem_lim->max_allowed_pages)) {
		TRACE(TRACE_OUT_OF_MEM, "Requested amount of memory "
			"(%d pages) for being executed commands on a device "
			"together with the already allocated memory exceeds "
			"the allowed maximum %d. Should you increase "
			"scst_max_dev_cmd_mem?", pages,
			mem_lim->max_allowed_pages);
		atomic_sub(pages, &mem_lim->alloced_pages);
		res = false;
	}

	TRACE_MEM("mem_lim %p, pages %d, res %d, new alloced %d", mem_lim,
		pages, res, atomic_read(&mem_lim->alloced_pages));

	return res;
}

/* No locks */
static void sgv_uncheck_allowed_mem(struct scst_mem_lim *mem_lim, int pages)
{
	atomic_sub(pages, &mem_lim->alloced_pages);
	TRACE_MEM("mem_lim %p, pages %d, new alloced %d", mem_lim,
		pages, atomic_read(&mem_lim->alloced_pages));
	return;
}

/**
 * sgv_pool_alloc - allocate an SG vector from the SGV pool
 * @pool: the cache to allocate from
 * @size: size of the resulting SG vector in bytes
 * @gfp_mask: the allocation mask
 * @flags: the allocation flags
 * @count: the resulting number of SG entries in the SG vector
 * @sgv: the resulting SGV object
 * @mem_lim: memory limits
 * @priv: pointer to private data for this allocation
 *
 * Description:
 *    Allocates an SG vector from the SGV pool and returns a pointer to it,
 *    or NULL in case of any error. See the SGV pool documentation for more
 *    details.
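 *
 * A minimal usage sketch (illustrative only; the pool, mem_lim and the
 * 64 KB size below are placeholders, error handling is trimmed):
 *
 *    struct sgv_pool_obj *sgv = NULL;
 *    struct scatterlist *sg;
 *    int sg_cnt;
 *
 *    sg = sgv_pool_alloc(pool, 64 * 1024, GFP_KERNEL | __GFP_NOWARN, 0,
 *                        &sg_cnt, &sgv, mem_lim, NULL);
 *    if (sg == NULL)
 *            return -ENOMEM;
 *    ...
 *    sgv_pool_free(sgv, mem_lim);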
*/ struct scatterlist *sgv_pool_alloc(struct sgv_pool *pool, unsigned int size, gfp_t gfp_mask, int flags, int *count, struct sgv_pool_obj **sgv, struct scst_mem_lim *mem_lim, void *priv) { struct sgv_pool_obj *obj; int cache_num, pages, cnt; struct scatterlist *res = NULL; int pages_to_alloc; int no_cached = flags & SGV_POOL_ALLOC_NO_CACHED; bool allowed_mem_checked = false, hiwmk_checked = false; TRACE_ENTRY(); if (unlikely(size == 0)) goto out; EXTRACHECKS_BUG_ON((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL); pages = PAGE_ALIGN(size) >> PAGE_SHIFT; if (pool->single_alloc_pages == 0) { int pages_order = get_order(size); cache_num = pages_order; pages_to_alloc = (1 << pages_order); } else { cache_num = 0; pages_to_alloc = max(pool->single_alloc_pages, pages); } TRACE_MEM("size=%d, pages=%d, pages_to_alloc=%d, cache num=%d, " "flags=%x, no_cached=%d, *sgv=%p", size, pages, pages_to_alloc, cache_num, flags, no_cached, *sgv); if (*sgv != NULL) { obj = *sgv; TRACE_MEM("Supplied obj %p, cache num %d", obj, obj->cache_num); EXTRACHECKS_BUG_ON(obj->sg_count != 0); if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail_free_sg_entries; allowed_mem_checked = true; if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; } else if ((pages_to_alloc <= pool->max_cached_pages) && !no_cached) { if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail; allowed_mem_checked = true; obj = sgv_get_obj(pool, cache_num, pages_to_alloc, gfp_mask, flags & SGV_POOL_ALLOC_GET_NEW); if (unlikely(obj == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "sgv_pool_obj failed (size %d)", size); goto out_fail; } if (obj->sg_count != 0) { TRACE_MEM("Cached obj %p", obj); atomic_inc(&pool->cache_acc[cache_num].hit_alloc); goto success; } if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) { if (!(flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL)) goto out_fail_free; } if (likely(!obj->recycling_list_entry.next)) { TRACE_MEM("Brand new obj %p", obj); } else if (unlikely(obj->sg_entries != obj->sg_entries_data)) { TRACE_MEM("Cached obj %p with sg_count == 0", obj); kfree(obj->sg_entries); obj->sg_entries = NULL; } if (pages_to_alloc <= sgv_max_local_pages) { obj->sg_entries = obj->sg_entries_data; sg_init_table(obj->sg_entries, pages_to_alloc); TRACE_MEM("sg_entries %p", obj->sg_entries); if (sgv_pool_clustered(pool)) { obj->trans_tbl = (struct trans_tbl_ent *) (obj->sg_entries + pages_to_alloc); TRACE_MEM("trans_tbl %p", obj->trans_tbl); /* * No need to clear trans_tbl, if needed, it * will be fully rewritten in * sgv_alloc_sg_entries(). 
*/ } } else { if (unlikely(sgv_alloc_arrays(obj, pages_to_alloc, gfp_mask) != 0)) goto out_fail_free; } if ((flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) && (flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL)) goto out_return; obj->allocator_priv = priv; if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; } else { int sz; pages_to_alloc = pages; if (unlikely(!sgv_check_allowed_mem(mem_lim, pages_to_alloc))) goto out_fail; allowed_mem_checked = true; if (flags & SGV_POOL_NO_ALLOC_ON_CACHE_MISS) goto out_return2; sz = sizeof(*obj) + pages * sizeof(obj->sg_entries[0]); obj = kmalloc(sz, gfp_mask); if (unlikely(obj == NULL)) { TRACE(TRACE_OUT_OF_MEM, "Allocation of " "sgv_pool_obj failed (size %d)", size); goto out_fail; } memset(obj, 0, sizeof(*obj)); obj->owner_pool = pool; cache_num = -1; obj->cache_num = cache_num; obj->pages = pages_to_alloc; obj->allocator_priv = priv; obj->sg_entries = obj->sg_entries_data; sg_init_table(obj->sg_entries, pages); if (unlikely(sgv_hiwmk_check(pages_to_alloc) != 0)) goto out_fail_free_sg_entries; hiwmk_checked = true; TRACE_MEM("Big or no_cached obj %p (size %d)", obj, sz); } obj->sg_count = sgv_alloc_sg_entries(obj->sg_entries, pages_to_alloc, gfp_mask, pool->clustering_type, obj->trans_tbl, &pool->alloc_fns, priv); if (unlikely(obj->sg_count <= 0)) { obj->sg_count = 0; if ((flags & SGV_POOL_RETURN_OBJ_ON_ALLOC_FAIL) && (cache_num >= 0)) goto out_return1; else goto out_fail_free_sg_entries; } if (cache_num >= 0) { atomic_add(pages_to_alloc - obj->sg_count, &pool->cache_acc[cache_num].merged); } else { if (no_cached) { atomic_add(pages_to_alloc, &pool->other_pages); atomic_add(pages_to_alloc - obj->sg_count, &pool->other_merged); } else { atomic_add(pages_to_alloc, &pool->big_pages); atomic_add(pages_to_alloc - obj->sg_count, &pool->big_merged); } } success: if (cache_num >= 0) { int sg; atomic_inc(&pool->cache_acc[cache_num].total_alloc); if (sgv_pool_clustered(pool)) cnt = obj->trans_tbl[pages-1].sg_num; else cnt = pages; sg = cnt-1; obj->orig_sg = sg; obj->orig_length = obj->sg_entries[sg].length; if (sgv_pool_clustered(pool)) { obj->sg_entries[sg].length = (pages - obj->trans_tbl[sg].pg_count) << PAGE_SHIFT; } } else { cnt = obj->sg_count; if (no_cached) atomic_inc(&pool->other_alloc); else atomic_inc(&pool->big_alloc); } *count = cnt; res = obj->sg_entries; *sgv = obj; obj->sg_entries[cnt-1].length -= PAGE_ALIGN(size) - size; sg_mark_end(&obj->sg_entries[cnt-1]); TRACE_MEM("obj=%p, sg_entries %p (size=%d, pages=%d, sg_count=%d, " "count=%d, last_len=%d)", obj, obj->sg_entries, size, pages, obj->sg_count, *count, obj->sg_entries[obj->orig_sg].length); out: TRACE_EXIT_HRES(res); return res; out_return: obj->allocator_priv = priv; obj->owner_pool = pool; out_return1: *sgv = obj; TRACE_MEM("Returning failed obj %p", obj); out_return2: *count = pages_to_alloc; res = NULL; goto out_uncheck; out_fail_free_sg_entries: if (obj->sg_entries != obj->sg_entries_data) { if (obj->trans_tbl != (struct trans_tbl_ent *)obj->sg_entries_data) { /* kfree() handles NULL parameter */ kfree(obj->trans_tbl); obj->trans_tbl = NULL; } kfree(obj->sg_entries); obj->sg_entries = NULL; } out_fail_free: if (cache_num >= 0) { spin_lock_bh(&pool->sgv_pool_lock); sgv_dec_cached_entries(pool, pages_to_alloc); spin_unlock_bh(&pool->sgv_pool_lock); kmem_cache_free(pool->caches[obj->cache_num], obj); } else kfree(obj); out_fail: res = NULL; *count = 0; *sgv = NULL; TRACE_MEM("%s", "Allocation failed"); out_uncheck: if (hiwmk_checked) 
sgv_hiwmk_uncheck(pages_to_alloc); if (allowed_mem_checked) sgv_uncheck_allowed_mem(mem_lim, pages_to_alloc); goto out; } EXPORT_SYMBOL_GPL(sgv_pool_alloc); /* * sgv_get_priv - return the private allocation data * * Allows to get the allocation private data for this SGV * cache object. The private data supposed to be set by sgv_pool_alloc(). */ void *sgv_get_priv(struct sgv_pool_obj *obj) { return obj->allocator_priv; } EXPORT_SYMBOL_GPL(sgv_get_priv); /** * sgv_pool_free - free previously allocated SG vector * @obj: the SGV object to free * @mem_lim: memory limits * * Description: * Frees previously allocated SG vector and updates memory limits */ void sgv_pool_free(struct sgv_pool_obj *obj, struct scst_mem_lim *mem_lim) { int pages = (obj->sg_count != 0) ? obj->pages : 0; TRACE_MEM("Freeing obj %p, cache num %d, pages %d, sg_entries %p, " "sg_count %d, allocator_priv %p", obj, obj->cache_num, pages, obj->sg_entries, obj->sg_count, obj->allocator_priv); /* * Enable it if you are investigating a data corruption and want to make * sure that target or dev handler didn't leave the pages mapped somewhere and, * hence, provoked a data corruption. * * Make sure the check value for _count is set correctly. In most cases, 1 is * correct, but, e.g., iSCSI-SCST can call it with value 2, because * it frees the corresponding cmd before the last put_page() call from * net_put_page() for the last page in the SG. Also, user space dev handlers * usually have their memory mapped in their address space. */ #if 0 { struct scatterlist *sg = obj->sg_entries; int i; for (i = 0; i < obj->sg_count; i++) { struct page *p = sg_page(&sg[i]); int len = sg[i].length; int pages = PAGE_ALIGN(len) >> PAGE_SHIFT; while (pages > 0) { if (page_count(p) != 1) { PRINT_WARNING("Freeing page %p with " "additional owners (_count %d). " "Data corruption possible!", p, page_count(p)); WARN_ON(1); } pages--; p++; } } } #endif if (obj->cache_num >= 0) { obj->sg_entries[obj->orig_sg].length = obj->orig_length; sg_unmark_end(&obj->sg_entries[obj->orig_sg]); sgv_put_obj(obj); } else { obj->owner_pool->alloc_fns.free_pages_fn(obj->sg_entries, obj->sg_count, obj->allocator_priv); kfree(obj); sgv_hiwmk_uncheck(pages); } sgv_uncheck_allowed_mem(mem_lim, pages); return; } EXPORT_SYMBOL_GPL(sgv_pool_free); /* * scst_alloc_sg() - allocates an SG vector * * Allocates and returns pointer to SG vector with data size "size". * In *count returned the count of entries in the vector. * Returns NULL for failure. * * Please don't use it for massive commands data buffers, because it * isn't fair and don't account per device memory limits. Use sgv_pool_alloc() * instead. */ struct scatterlist *scst_alloc_sg(int size, gfp_t gfp_mask, int *count) { struct scatterlist *res; int pages = PAGE_ALIGN(size) >> PAGE_SHIFT; struct sgv_pool_alloc_fns sys_alloc_fns = { sgv_alloc_sys_pages, sgv_free_sys_sg_entries }; int no_fail = ((gfp_mask & __GFP_NOFAIL) == __GFP_NOFAIL); int cnt; TRACE_ENTRY(); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_inc(&sgv_other_total_alloc); #endif if (unlikely(sgv_hiwmk_check(pages) != 0)) { if (!no_fail) { res = NULL; goto out; } else { /* * Update active_pages_total since alloc can't fail. * If it wasn't updated then the counter would cross 0 * on free again. 
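 *
 * Passing a negative value to sgv_hiwmk_uncheck() below effectively
 * adds the pages to sgv_pages_total, so the sgv_hiwmk_uncheck() done
 * later by scst_free_sg() stays balanced.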
*/ sgv_hiwmk_uncheck(-pages); } } res = kmalloc_array(pages, sizeof(*res), gfp_mask); if (res == NULL) { TRACE(TRACE_OUT_OF_MEM, "Unable to allocate sg for %d pages", pages); goto out_uncheck; } sg_init_table(res, pages); /* * If we allow use clustering here, we will have troubles in * scst_free_sg() to figure out how many pages are in the SG vector. * So, let's always don't use clustering. */ cnt = sgv_alloc_sg_entries(res, pages, gfp_mask, sgv_no_clustering, NULL, &sys_alloc_fns, NULL); if (cnt <= 0) goto out_free; res[cnt-1].length -= PAGE_ALIGN(size) - size; *count = cnt; out: TRACE_MEM("Alloced sg %p (count %d, no_fail %d)", res, *count, no_fail); TRACE_EXIT_HRES(res); return res; out_free: kfree(res); res = NULL; out_uncheck: if (!no_fail) sgv_hiwmk_uncheck(pages); goto out; } EXPORT_SYMBOL_GPL(scst_alloc_sg); /* * scst_free_sg() - frees SG vector * * Frees SG vector returned by scst_alloc_sg(). */ void scst_free_sg(struct scatterlist *sg, int count) { TRACE_MEM("Freeing sg=%p", sg); sgv_hiwmk_uncheck(count); sgv_free_sys_sg_entries(sg, count, NULL); kfree(sg); return; } EXPORT_SYMBOL_GPL(scst_free_sg); /* Must be called under sgv_pools_mutex */ static void sgv_pool_init_cache(struct sgv_pool *pool, int cache_num, bool per_cpu) { int size; int pages; struct sgv_pool_obj *obj; atomic_set(&pool->cache_acc[cache_num].total_alloc, 0); atomic_set(&pool->cache_acc[cache_num].hit_alloc, 0); atomic_set(&pool->cache_acc[cache_num].merged, 0); if (pool->single_alloc_pages == 0) pages = 1 << cache_num; else pages = pool->single_alloc_pages; if (pages <= sgv_max_local_pages) { size = sizeof(*obj) + pages * (sizeof(obj->sg_entries[0]) + ((pool->clustering_type != sgv_no_clustering) ? sizeof(obj->trans_tbl[0]) : 0)); } else if (pages <= sgv_max_trans_pages) { /* * sg_entries is allocated outside object, * but trans_tbl is still embedded. */ size = sizeof(*obj) + pages * (((pool->clustering_type != sgv_no_clustering) ? sizeof(obj->trans_tbl[0]) : 0)); } else { size = sizeof(*obj); /* both sgv and trans_tbl are kmalloc'ed() */ } TRACE_MEM("pages=%d, size=%d (per cpu %d)", pages, size, per_cpu); scnprintf(pool->cache_names[cache_num], sizeof(pool->cache_names[cache_num]), "%s-%uK", pool->name, (pages << PAGE_SHIFT) >> 10); pool->caches[cache_num] = kmem_cache_create( pool->cache_names[cache_num], size, 0, per_cpu ? 
SCST_SLAB_FLAGS : (SCST_SLAB_FLAGS|SLAB_HWCACHE_ALIGN), NULL #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) , NULL); #else ); #endif return; } /* Must be called under sgv_pools_mutex */ static int sgv_pool_init(struct sgv_pool *pool, const char *name, enum sgv_clustering_types clustering_type, int single_alloc_pages, int purge_interval, bool per_cpu) { int res = -ENOMEM; int i; TRACE_ENTRY(); if (single_alloc_pages < 0) { PRINT_ERROR("Wrong single_alloc_pages value %d", single_alloc_pages); res = -EINVAL; goto out; } memset(pool, 0, sizeof(*pool)); atomic_set(&pool->big_alloc, 0); atomic_set(&pool->big_pages, 0); atomic_set(&pool->big_merged, 0); atomic_set(&pool->other_alloc, 0); atomic_set(&pool->other_pages, 0); atomic_set(&pool->other_merged, 0); pool->clustering_type = clustering_type; pool->single_alloc_pages = single_alloc_pages; if (purge_interval != 0) { pool->purge_interval = purge_interval; if (purge_interval < 0) { /* Let's pretend that it's always scheduled */ pool->purge_work_scheduled = 1; } } else pool->purge_interval = SGV_DEFAULT_PURGE_INTERVAL; if (single_alloc_pages == 0) { pool->max_caches = SGV_POOL_ELEMENTS; pool->max_cached_pages = 1 << (SGV_POOL_ELEMENTS - 1); } else { pool->max_caches = 1; pool->max_cached_pages = single_alloc_pages; } pool->alloc_fns.alloc_pages_fn = sgv_alloc_sys_pages; pool->alloc_fns.free_pages_fn = sgv_free_sys_sg_entries; TRACE_MEM("name %s, sizeof(*obj)=%zd, clustering_type=%d, " "single_alloc_pages=%d, max_caches=%d, max_cached_pages=%d", name, sizeof(struct sgv_pool_obj), clustering_type, single_alloc_pages, pool->max_caches, pool->max_cached_pages); strlcpy(pool->name, name, sizeof(pool->name)-1); pool->owner_mm = current->mm; for (i = 0; i < pool->max_caches; i++) { sgv_pool_init_cache(pool, i, per_cpu); if (pool->caches[i] == NULL) { PRINT_ERROR("Allocation of sgv_pool " "cache %s(%d) failed", name, i); goto out_free; } } atomic_set(&pool->sgv_pool_ref, 1); spin_lock_init(&pool->sgv_pool_lock); INIT_LIST_HEAD(&pool->sorted_recycling_list); for (i = 0; i < pool->max_caches; i++) INIT_LIST_HEAD(&pool->recycling_lists[i]); #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 20)) INIT_DELAYED_WORK(&pool->sgv_purge_work, sgv_purge_work_fn); #else INIT_WORK(&pool->sgv_purge_work, sgv_purge_work_fn, pool); #endif spin_lock_bh(&sgv_pools_lock); list_add_tail(&pool->sgv_pools_list_entry, &sgv_pools_list); spin_unlock_bh(&sgv_pools_lock); res = scst_sgv_sysfs_create(pool); if (res != 0) goto out_del; res = 0; out: TRACE_EXIT_RES(res); return res; out_del: spin_lock_bh(&sgv_pools_lock); list_del(&pool->sgv_pools_list_entry); spin_unlock_bh(&sgv_pools_lock); synchronize_rcu(); out_free: for (i = 0; i < pool->max_caches; i++) { if (pool->caches[i]) { kmem_cache_destroy(pool->caches[i]); pool->caches[i] = NULL; } else break; } goto out; } static void sgv_evaluate_local_max_pages(void) { int space4sgv_ttbl = PAGE_SIZE - sizeof(struct sgv_pool_obj); sgv_max_local_pages = space4sgv_ttbl / (sizeof(struct trans_tbl_ent) + sizeof(struct scatterlist)); sgv_max_trans_pages = space4sgv_ttbl / sizeof(struct trans_tbl_ent); TRACE_MEM("sgv_max_local_pages %d, sgv_max_trans_pages %d", sgv_max_local_pages, sgv_max_trans_pages); return; } /* * sgv_pool_flush() - flushes the SGV pool. * * Flushes, i.e. frees, all the cached entries in the SGV pool. 
*/ void sgv_pool_flush(struct sgv_pool *pool) { int i; TRACE_ENTRY(); for (i = 0; i < pool->max_caches; i++) { struct sgv_pool_obj *obj; spin_lock_bh(&pool->sgv_pool_lock); while (!list_empty(&pool->recycling_lists[i])) { obj = list_first_entry(&pool->recycling_lists[i], struct sgv_pool_obj, recycling_list_entry); __sgv_purge_from_cache(obj); spin_unlock_bh(&pool->sgv_pool_lock); EXTRACHECKS_BUG_ON(obj->owner_pool != pool); sgv_dtor_and_free(obj); spin_lock_bh(&pool->sgv_pool_lock); } spin_unlock_bh(&pool->sgv_pool_lock); } TRACE_EXIT(); return; } EXPORT_SYMBOL_GPL(sgv_pool_flush); static void sgv_pool_destroy(struct sgv_pool *pool) { int i; TRACE_ENTRY(); sgv_pool_flush(pool); mutex_lock(&sgv_pools_mutex); spin_lock_bh(&sgv_pools_lock); list_del(&pool->sgv_pools_list_entry); spin_unlock_bh(&sgv_pools_lock); mutex_unlock(&sgv_pools_mutex); synchronize_rcu(); scst_sgv_sysfs_del(pool); cancel_delayed_work_sync(&pool->sgv_purge_work); for (i = 0; i < pool->max_caches; i++) { if (pool->caches[i]) kmem_cache_destroy(pool->caches[i]); pool->caches[i] = NULL; } kmem_cache_free(sgv_pool_cachep, pool); TRACE_EXIT(); return; } /** * sgv_pool_set_allocator - set custom pages allocator * @pool: the cache * @alloc_pages_fn: pages allocation function * @free_pages_fn: pages freeing function * * Description: * Allows to set custom pages allocator for the SGV pool. * See the SGV pool documentation for more details. */ void sgv_pool_set_allocator(struct sgv_pool *pool, struct page *(*alloc_pages_fn)(struct scatterlist *, gfp_t, void *), void (*free_pages_fn)(struct scatterlist *, int, void *)) { pool->alloc_fns.alloc_pages_fn = alloc_pages_fn; pool->alloc_fns.free_pages_fn = free_pages_fn; return; } EXPORT_SYMBOL_GPL(sgv_pool_set_allocator); /** * sgv_pool_create_node - creates and initializes an SGV pool * @name: the name of the SGV pool * @clustering_type: sets type of the pages clustering. * @single_alloc_pages: if 0, then the SGV pool will work in the set of * power 2 size buffers mode. If >0, then the SGV pool will * work in the fixed size buffers mode. In this case * single_alloc_pages sets the size of each buffer in pages. * @shared: sets if the SGV pool can be shared between devices or not. * The cache sharing allowed only between devices created inside * the same address space. If an SGV pool is shared, each * subsequent call of sgv_pool_create*() with the same cache name * will not create a new cache, but instead return a reference * to it. * @purge_interval: sets the cache purging interval. I.e., an SG buffer * will be freed if it's unused for time t * purge_interval <= t < 2*purge_interval. If purge_interval * is 0, then the default interval will be used (60 seconds). * If purge_interval <0, then the automatic purging will be * disabled. In HZ. * @nodeid: NUMA node for this pool. Can be NUMA_NO_NODE, if the * caller doesn't care. * * Description: * Returns the resulting SGV pool or NULL in case of any error. 
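 *
 * A minimal usage sketch (illustrative only; the pool name and the
 * parameters below are hypothetical):
 *
 *    struct sgv_pool *p;
 *
 *    p = sgv_pool_create_node("my-sgv-pool", sgv_no_clustering, 0,
 *                             false, 0, NUMA_NO_NODE);
 *    if (p == NULL)
 *            return -ENOMEM;
 *    ...
 *    sgv_pool_del(p);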
*/ struct sgv_pool *sgv_pool_create_node(const char *name, enum sgv_clustering_types clustering_type, int single_alloc_pages, bool shared, int purge_interval, int nodeid) { struct sgv_pool *pool, *tp; int rc; TRACE_ENTRY(); TRACE_MEM("Creating pool %s (clustering_type %d, " "single_alloc_pages %d, shared %d, purge_interval %d, " "nodeid %d)", name, clustering_type, single_alloc_pages, shared, purge_interval, nodeid); /* * __sgv_shrink() takes sgv_pools_mutex, so we have to play tricks to * prevent deadlock with it if this allocation will try to reclaim memory */ pool = kmem_cache_alloc_node(sgv_pool_cachep, GFP_KERNEL, nodeid); if (pool == NULL) { PRINT_ERROR("Allocation of sgv_pool failed (size %zd)", sizeof(*pool)); goto out; } memset(pool, 0, sizeof(*pool)); mutex_lock(&sgv_pools_mutex); list_for_each_entry(tp, &sgv_pools_list, sgv_pools_list_entry) { if (strcmp(tp->name, name) == 0) { if (shared) { if (tp->owner_mm != current->mm) { PRINT_ERROR("Attempt of a shared use " "of SGV pool %s with " "different MM", name); goto out_free; } sgv_pool_get(tp); goto out_free; } else { PRINT_ERROR("SGV pool %s already exists", name); tp = NULL; goto out_free; } } } tp = NULL; rc = sgv_pool_init(pool, name, clustering_type, single_alloc_pages, purge_interval, nodeid != NUMA_NO_NODE); if (rc != 0) goto out_free; out_unlock: mutex_unlock(&sgv_pools_mutex); out: TRACE_EXIT_RES(pool != NULL); return pool; out_free: kmem_cache_free(sgv_pool_cachep, pool); pool = tp; goto out_unlock; } EXPORT_SYMBOL_GPL(sgv_pool_create_node); /* * sgv_pool_get - increase ref counter for the corresponding SGV pool * * Increases ref counter for the corresponding SGV pool */ void sgv_pool_get(struct sgv_pool *pool) { atomic_inc(&pool->sgv_pool_ref); TRACE_MEM("Incrementing sgv pool %p ref (new value %d)", pool, atomic_read(&pool->sgv_pool_ref)); return; } EXPORT_SYMBOL_GPL(sgv_pool_get); /* * sgv_pool_put - decrease ref counter for the corresponding SGV pool * * Decreases ref counter for the corresponding SGV pool. If the ref * counter reaches 0, the cache will be destroyed. */ void sgv_pool_put(struct sgv_pool *pool) { TRACE_MEM("Decrementing sgv pool %p ref (new value %d)", pool, atomic_read(&pool->sgv_pool_ref)-1); if (atomic_dec_and_test(&pool->sgv_pool_ref)) sgv_pool_destroy(pool); return; } EXPORT_SYMBOL_GPL(sgv_pool_put); /** * sgv_pool_del - deletes the corresponding SGV pool * @pool: the cache to delete. * * Description: * If the cache is shared, it will decrease its reference counter. * If the reference counter reaches 0, the cache will be destroyed. */ void sgv_pool_del(struct sgv_pool *pool) { TRACE_ENTRY(); sgv_pool_put(pool); TRACE_EXIT(); return; } EXPORT_SYMBOL_GPL(sgv_pool_del); /* Both parameters in pages */ int scst_sgv_pools_init(unsigned long mem_hwmark, unsigned long mem_lwmark) { int res = 0, i; TRACE_ENTRY(); sgv_pool_cachep = KMEM_CACHE(sgv_pool, SCST_SLAB_FLAGS|SLAB_HWCACHE_ALIGN); if (sgv_pool_cachep == NULL) goto out_err; sgv_hi_wmk = mem_hwmark; sgv_lo_wmk = mem_lwmark; sgv_evaluate_local_max_pages(); sgv_norm_pool_main = sgv_pool_create("sgv", sgv_no_clustering, 0, false, 0); if (sgv_norm_pool_main == NULL) goto out_free_pool; sgv_norm_clust_pool_main = sgv_pool_create("sgv-clust", sgv_full_clustering, 0, false, 0); if (sgv_norm_clust_pool_main == NULL) goto out_free_norm; sgv_dma_pool_main = sgv_pool_create("sgv-dma", sgv_no_clustering, 0, false, 0); if (sgv_dma_pool_main == NULL) goto out_free_clust; /* * ToDo: not compatible with CPU hotplug! Notification * callbacks must be installed! 
*/ for (i = 0; i < nr_cpu_ids; i++) sgv_norm_pool_global[i] = sgv_norm_pool_main; for (i = 0; i < nr_cpu_ids; i++) sgv_norm_clust_pool_global[i] = sgv_norm_clust_pool_main; for (i = 0; i < nr_cpu_ids; i++) sgv_dma_pool_global[i] = sgv_dma_pool_main; for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-%d", i); sgv_norm_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_no_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_norm_pool_per_cpu[i] == NULL) goto out_free_per_cpu_norm; } for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-clust-%d", i); sgv_norm_clust_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_full_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_norm_clust_pool_per_cpu[i] == NULL) goto out_free_per_cpu_clust; } for (i = 0; i < nr_cpu_ids; i++) { char name[60]; if (!cpu_online(i)) continue; scnprintf(name, sizeof(name), "sgv-dma-%d", i); sgv_dma_pool_per_cpu[i] = sgv_pool_create_node(name, sgv_no_clustering, 0, false, 0, cpu_to_node(i)); if (sgv_dma_pool_per_cpu[i] == NULL) goto out_free_per_cpu_dma; } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) sgv_shrinker = set_shrinker(DEFAULT_SEEKS, sgv_shrink); #else #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0) sgv_shrinker.count_objects = sgv_can_be_shrunk; sgv_shrinker.scan_objects = sgv_scan_shrink; #else sgv_shrinker.shrink = sgv_shrink; #endif sgv_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sgv_shrinker); #endif out: TRACE_EXIT_RES(res); return res; out_free_per_cpu_dma: for (i = 0; i < nr_cpu_ids; i++) if (sgv_dma_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_dma_pool_per_cpu[i]); out_free_per_cpu_clust: for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_clust_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_clust_pool_per_cpu[i]); out_free_per_cpu_norm: for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_pool_per_cpu[i]); sgv_pool_destroy(sgv_dma_pool_main); out_free_clust: sgv_pool_destroy(sgv_norm_clust_pool_main); out_free_norm: sgv_pool_destroy(sgv_norm_pool_main); out_free_pool: kmem_cache_destroy(sgv_pool_cachep); out_err: res = -ENOMEM; goto out; } void scst_sgv_pools_deinit(void) { int i; TRACE_ENTRY(); #if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)) remove_shrinker(sgv_shrinker); #else unregister_shrinker(&sgv_shrinker); #endif sgv_pool_destroy(sgv_dma_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_dma_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_dma_pool_per_cpu[i]); sgv_pool_destroy(sgv_norm_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_pool_per_cpu[i]); sgv_pool_destroy(sgv_norm_clust_pool_main); for (i = 0; i < nr_cpu_ids; i++) if (sgv_norm_clust_pool_per_cpu[i] != NULL) sgv_pool_destroy(sgv_norm_clust_pool_per_cpu[i]); for (i = 0; i < nr_cpu_ids; i++) sgv_norm_pool_global[i] = NULL; for (i = 0; i < nr_cpu_ids; i++) sgv_norm_clust_pool_global[i] = NULL; for (i = 0; i < nr_cpu_ids; i++) sgv_dma_pool_global[i] = NULL; kmem_cache_destroy(sgv_pool_cachep); TRACE_EXIT(); return; } static ssize_t sgv_sysfs_stat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct sgv_pool *pool; int i, total = 0, hit = 0, merged = 0, allocated = 0; int oa, om, res; pool = container_of(kobj, struct sgv_pool, sgv_kobj); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { int t; hit += atomic_read(&pool->cache_acc[i].hit_alloc); total += 
atomic_read(&pool->cache_acc[i].total_alloc); t = atomic_read(&pool->cache_acc[i].total_alloc) - atomic_read(&pool->cache_acc[i].hit_alloc); allocated += t * (1 << i); merged += atomic_read(&pool->cache_acc[i].merged); } res = sprintf(buf, "%-30s %-11s %-11s %-11s %-11s", "Name", "Hit", "Total", "% merged", "Cached (P/I/O)"); res += sprintf(&buf[res], "\n%-30s %-11d %-11d %-11d %d/%d/%d\n", pool->name, hit, total, (allocated != 0) ? merged*100/allocated : 0, pool->cached_pages, pool->inactive_cached_pages, pool->cached_entries); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { int t = atomic_read(&pool->cache_acc[i].total_alloc) - atomic_read(&pool->cache_acc[i].hit_alloc); allocated = t * (1 << i); merged = atomic_read(&pool->cache_acc[i].merged); res += sprintf(&buf[res], " %-28s %-11d %-11d %d\n", pool->cache_names[i], atomic_read(&pool->cache_acc[i].hit_alloc), atomic_read(&pool->cache_acc[i].total_alloc), (allocated != 0) ? merged*100/allocated : 0); } allocated = atomic_read(&pool->big_pages); merged = atomic_read(&pool->big_merged); oa = atomic_read(&pool->other_pages); om = atomic_read(&pool->other_merged); res += sprintf(&buf[res], " %-40s %d/%-9d %d/%d\n", "big/other", atomic_read(&pool->big_alloc), atomic_read(&pool->other_alloc), (allocated != 0) ? merged*100/allocated : 0, (oa != 0) ? om/oa : 0); return res; } static ssize_t sgv_sysfs_stat_reset(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct sgv_pool *pool; int i; TRACE_ENTRY(); pool = container_of(kobj, struct sgv_pool, sgv_kobj); for (i = 0; i < SGV_POOL_ELEMENTS; i++) { atomic_set(&pool->cache_acc[i].hit_alloc, 0); atomic_set(&pool->cache_acc[i].total_alloc, 0); atomic_set(&pool->cache_acc[i].merged, 0); } atomic_set(&pool->big_pages, 0); atomic_set(&pool->big_merged, 0); atomic_set(&pool->big_alloc, 0); atomic_set(&pool->other_pages, 0); atomic_set(&pool->other_merged, 0); atomic_set(&pool->other_alloc, 0); PRINT_INFO("Statistics for SGV pool %s reset", pool->name); TRACE_EXIT_RES(count); return count; } static ssize_t sgv_sysfs_global_stat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct sgv_pool *pool; int inactive_pages = 0, res; TRACE_ENTRY(); spin_lock_bh(&sgv_pools_lock); list_for_each_entry(pool, &sgv_pools_list, sgv_pools_list_entry) { inactive_pages += pool->inactive_cached_pages; } spin_unlock_bh(&sgv_pools_lock); #ifdef CONFIG_SCST_NO_TOTAL_MEM_CHECKS res = sprintf(buf, "%-42s %d\n", "Inactive pages", inactive_pages); #else res = sprintf(buf, "%-42s %d/%d\n%-42s %d/%d\n%-42s %d/%d\n" "%-42s %-11d\n", "Inactive/active pages", inactive_pages, atomic_read(&sgv_pages_total) - inactive_pages, "Hi/lo watermarks [pages]", sgv_hi_wmk, sgv_lo_wmk, "Hi watermark releases/failures", atomic_read(&sgv_releases_on_hiwmk), atomic_read(&sgv_releases_on_hiwmk_failed), "Other allocs", atomic_read(&sgv_other_total_alloc)); #endif TRACE_EXIT(); return res; } static ssize_t sgv_sysfs_global_stat_reset(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { TRACE_ENTRY(); atomic_set(&sgv_releases_on_hiwmk, 0); atomic_set(&sgv_releases_on_hiwmk_failed, 0); #ifndef CONFIG_SCST_NO_TOTAL_MEM_CHECKS atomic_set(&sgv_other_total_alloc, 0); #endif PRINT_INFO("%s", "Global SGV pool statistics reset"); TRACE_EXIT_RES(count); return count; } static struct kobj_attribute sgv_stat_attr = __ATTR(stats, S_IRUGO | S_IWUSR, sgv_sysfs_stat_show, sgv_sysfs_stat_reset); static struct attribute *sgv_attrs[] = { &sgv_stat_attr.attr, NULL, }; static void 
sgv_kobj_release(struct kobject *kobj) { struct sgv_pool *pool; TRACE_ENTRY(); pool = container_of(kobj, struct sgv_pool, sgv_kobj); if (pool->sgv_kobj_release_cmpl != NULL) complete_all(pool->sgv_kobj_release_cmpl); TRACE_EXIT(); return; } static struct kobj_type sgv_pool_ktype = { .sysfs_ops = &scst_sysfs_ops, .release = sgv_kobj_release, .default_attrs = sgv_attrs, }; static int scst_sgv_sysfs_create(struct sgv_pool *pool) { int res; TRACE_ENTRY(); res = kobject_init_and_add(&pool->sgv_kobj, &sgv_pool_ktype, scst_sgv_kobj, pool->name); if (res != 0) { PRINT_ERROR("Can't add sgv pool %s to sysfs", pool->name); goto out; } out: TRACE_EXIT_RES(res); return res; } static void scst_sgv_sysfs_del(struct sgv_pool *pool) { DECLARE_COMPLETION_ONSTACK(c); TRACE_ENTRY(); pool->sgv_kobj_release_cmpl = &c; kobject_del(&pool->sgv_kobj); SCST_KOBJECT_PUT_AND_WAIT(&pool->sgv_kobj, "SGV pool", &c, &scst_pool_dep_map); TRACE_EXIT(); } static struct kobj_attribute sgv_global_stat_attr = __ATTR(global_stats, S_IRUGO | S_IWUSR, sgv_sysfs_global_stat_show, sgv_sysfs_global_stat_reset); static struct attribute *sgv_default_attrs[] = { &sgv_global_stat_attr.attr, NULL, }; static void scst_sysfs_release(struct kobject *kobj) { kfree(kobj); } static struct kobj_type sgv_ktype = { .sysfs_ops = &scst_sysfs_ops, .release = scst_sysfs_release, .default_attrs = sgv_default_attrs, }; /* * scst_add_sgv_kobj() - Initialize and add the root SGV kernel object. */ int scst_add_sgv_kobj(struct kobject *parent, const char *name) { int res; WARN_ON(scst_sgv_kobj); res = -ENOMEM; scst_sgv_kobj = kzalloc(sizeof(*scst_sgv_kobj), GFP_KERNEL); if (!scst_sgv_kobj) goto out; res = kobject_init_and_add(scst_sgv_kobj, &sgv_ktype, parent, name); if (res != 0) goto out_free; out: return res; out_free: kobject_put(scst_sgv_kobj); scst_sgv_kobj = NULL; goto out; } /** * scst_del_put_sgv_kobj() - Remove the root SGV kernel object. */ void scst_del_put_sgv_kobj(void) { WARN_ON(!scst_sgv_kobj); kobject_del(scst_sgv_kobj); kobject_put(scst_sgv_kobj); scst_sgv_kobj = NULL; }
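/*
 * A minimal, illustrative sketch of the SGV pool lifecycle as seen from a
 * pool user, kept under "#if 0" like the other illustrative snippets in
 * this file. The function name and buffer size are hypothetical, and it
 * assumes scst_init_mem_lim() from scst.h for setting up the memory limit.
 */
#if 0
static int sgv_pool_usage_example(void)
{
	struct scst_mem_lim mem_lim;
	struct sgv_pool *pool;
	struct sgv_pool_obj *sgv = NULL;
	struct scatterlist *sg;
	int sg_cnt, res = -ENOMEM;

	scst_init_mem_lim(&mem_lim);

	/* Power-of-2 caches, no clustering, default purge interval */
	pool = sgv_pool_create("example", sgv_no_clustering, 0, false, 0);
	if (pool == NULL)
		goto out;

	sg = sgv_pool_alloc(pool, 128 * 1024, GFP_KERNEL | __GFP_NOWARN, 0,
			    &sg_cnt, &sgv, &mem_lim, NULL);
	if (sg == NULL)
		goto out_del;

	/* ... use sg[0..sg_cnt-1] ... */

	sgv_pool_free(sgv, &mem_lim);
	res = 0;

out_del:
	sgv_pool_del(pool);
out:
	return res;
}
#endif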