/*
 * Copyright 2014-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *
 *     * Neither the name of the copyright holder nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Some changes and fixes made by:
 *	Boaz Harrosh
 */

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>
#include <cpuid.h>

#include "movnt.h"

#define EAX_IDX 0
#define EBX_IDX 1
#define ECX_IDX 2
#define EDX_IDX 3

#define CLFLUSHOPT_FUNC	0x7
#define CLFLUSHOPT_BIT	(1 << 23)

#define CACHELINE_ALIGN	((uintptr_t)64)
#define CACHELINE_MASK	(CACHELINE_ALIGN - 1)

#define CHUNK_SIZE	128	/* 16*8 */
#define CHUNK_SHIFT	7
#define CHUNK_MASK	(CHUNK_SIZE - 1)

#define DWORD_SIZE	4
#define DWORD_SHIFT	2
#define DWORD_MASK	(DWORD_SIZE - 1)

#define MOVNT_SIZE	16
#define MOVNT_MASK	(MOVNT_SIZE - 1)
#define MOVNT_SHIFT	4

#define MOVNT_THRESHOLD	256

/*
 * flush_clflush -- (internal) flush the CPU cache, using clflush
 * (Boaz: only used here for the non-aligned tails of movnt; clflush
 * is always better than clflushopt in this case, even if clflushopt is
 * available)
 */
static void
flush_clflush(const void *addr, size_t len)
{
	uintptr_t uptr;

	/*
	 * Loop through cache-line-size (typically 64B) aligned chunks
	 * covering the given range.
	 */
	for (uptr = (uintptr_t)addr & ~(CACHELINE_ALIGN - 1);
		uptr < (uintptr_t)addr + len; uptr += CACHELINE_ALIGN)
		_mm_clflush((char *)uptr);
}

static void
pmem_flush(const void *addr, size_t len)
{
	flush_clflush(addr, len);
}
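/*
 * Illustrative sketch, not part of the original code: a clflushopt-based
 * flush loop would look roughly like the disabled variant below (presumably
 * close to what __cl_flush_opt, declared in movnt.h, provides).  It is kept
 * under "#if 0" because, as noted above, plain clflush is preferred for the
 * small unaligned tails flushed in this file, and _mm_clflushopt() needs
 * <immintrin.h>, -mclflushopt, and a trailing sfence to order the flushes.
 */
#if 0
static void
flush_clflushopt(const void *addr, size_t len)
{
	uintptr_t uptr;

	for (uptr = (uintptr_t)addr & ~CACHELINE_MASK;
		uptr < (uintptr_t)addr + len; uptr += CACHELINE_ALIGN)
		_mm_clflushopt((void *)uptr);
	_mm_sfence();
}
#endif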
/*
 * memmove_nodrain_movnt -- (internal) memmove to pmem without hw drain, movnt
 */
static void *
memmove_nodrain_movnt(void *pmemdest, const void *src, size_t len)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
	size_t i;
	__m128i *d;
	const __m128i *s;
	void *dest1 = pmemdest;
	size_t cnt;

	if (len == 0 || src == pmemdest)
		return pmemdest;

	if (len < MOVNT_THRESHOLD) {
		memmove(pmemdest, src, len);
		pmem_flush(pmemdest, len);
		return pmemdest;
	}

	if ((uintptr_t)dest1 - (uintptr_t)src >= len) {
		/*
		 * Copy the range in the forward direction.
		 *
		 * This is the most common, most optimized case, used unless
		 * the overlap specifically prevents it.
		 */

		/* copy up to CACHELINE_ALIGN boundary */
		cnt = (uint64_t)dest1 & CACHELINE_MASK;
		if (cnt > 0) {
			uint8_t *d8;
			const uint8_t *s8;

			cnt = CACHELINE_ALIGN - cnt;

			/* never try to copy more than len bytes */
			if (cnt > len)
				cnt = len;

			d8 = dest1;
			s8 = src;
			for (i = 0; i < cnt; i++) {
				*d8 = *s8;
				d8++;
				s8++;
			}
			pmem_flush(dest1, cnt);
			dest1 = (char *)dest1 + cnt;
			src = (const char *)src + cnt;
			len -= cnt;
		}

		d = dest1;
		s = src;

		cnt = len >> CHUNK_SHIFT;
		for (i = 0; i < cnt; i++) {
			xmm0 = _mm_loadu_si128(s);
			xmm1 = _mm_loadu_si128(s + 1);
			xmm2 = _mm_loadu_si128(s + 2);
			xmm3 = _mm_loadu_si128(s + 3);
			xmm4 = _mm_loadu_si128(s + 4);
			xmm5 = _mm_loadu_si128(s + 5);
			xmm6 = _mm_loadu_si128(s + 6);
			xmm7 = _mm_loadu_si128(s + 7);
			s += 8;
			_mm_stream_si128(d, xmm0);
			_mm_stream_si128(d + 1, xmm1);
			_mm_stream_si128(d + 2, xmm2);
			_mm_stream_si128(d + 3, xmm3);
			_mm_stream_si128(d + 4, xmm4);
			_mm_stream_si128(d + 5, xmm5);
			_mm_stream_si128(d + 6, xmm6);
			_mm_stream_si128(d + 7, xmm7);
			d += 8;
		}

		/* copy the tail (<128 bytes) in 16-byte chunks */
		len &= CHUNK_MASK;
		if (len != 0) {
			cnt = len >> MOVNT_SHIFT;
			for (i = 0; i < cnt; i++) {
				xmm0 = _mm_loadu_si128(s);
				_mm_stream_si128(d, xmm0);
				s++;
				d++;
			}
		}

		/* copy the last bytes (<16), first dwords then bytes */
		len &= MOVNT_MASK;
		if (len != 0) {
			int32_t *d32 = (int32_t *)d;
			const int32_t *s32 = (const int32_t *)s;
			uint8_t *d8;
			const uint8_t *s8;

			cnt = len >> DWORD_SHIFT;
			for (i = 0; i < cnt; i++) {
				_mm_stream_si32(d32, *s32);
				d32++;
				s32++;
			}

			cnt = len & DWORD_MASK;
			d8 = (uint8_t *)d32;
			s8 = (const uint8_t *)s32;
			for (i = 0; i < cnt; i++) {
				*d8 = *s8;
				d8++;
				s8++;
			}
			pmem_flush(d32, cnt);
		}
	} else {
		/*
		 * Copy the range in the backward direction.
		 *
		 * This prevents overwriting source data due to an
		 * overlapped destination range.
		 */

		dest1 = (char *)dest1 + len;
		src = (const char *)src + len;

		cnt = (uint64_t)dest1 & CACHELINE_MASK;
		if (cnt > 0) {
			uint8_t *d8;
			const uint8_t *s8;

			/* never try to copy more than len bytes */
			if (cnt > len)
				cnt = len;

			d8 = dest1;
			s8 = src;
			for (i = 0; i < cnt; i++) {
				d8--;
				s8--;
				*d8 = *s8;
			}
			pmem_flush(d8, cnt);
			dest1 = (char *)dest1 - cnt;
			src = (const char *)src - cnt;
			len -= cnt;
		}

		d = (__m128i *)dest1;
		s = (const __m128i *)src;

		cnt = len >> CHUNK_SHIFT;
		for (i = 0; i < cnt; i++) {
			xmm0 = _mm_loadu_si128(s - 1);
			xmm1 = _mm_loadu_si128(s - 2);
			xmm2 = _mm_loadu_si128(s - 3);
			xmm3 = _mm_loadu_si128(s - 4);
			xmm4 = _mm_loadu_si128(s - 5);
			xmm5 = _mm_loadu_si128(s - 6);
			xmm6 = _mm_loadu_si128(s - 7);
			xmm7 = _mm_loadu_si128(s - 8);
			s -= 8;
			_mm_stream_si128(d - 1, xmm0);
			_mm_stream_si128(d - 2, xmm1);
			_mm_stream_si128(d - 3, xmm2);
			_mm_stream_si128(d - 4, xmm3);
			_mm_stream_si128(d - 5, xmm4);
			_mm_stream_si128(d - 6, xmm5);
			_mm_stream_si128(d - 7, xmm6);
			_mm_stream_si128(d - 8, xmm7);
			d -= 8;
		}

		/* copy the tail (<128 bytes) in 16-byte chunks */
		len &= CHUNK_MASK;
		if (len != 0) {
			cnt = len >> MOVNT_SHIFT;
			for (i = 0; i < cnt; i++) {
				d--;
				s--;
				xmm0 = _mm_loadu_si128(s);
				_mm_stream_si128(d, xmm0);
			}
		}

		/* copy the last bytes (<16), first dwords then bytes */
		len &= MOVNT_MASK;
		if (len != 0) {
			int32_t *d32 = (int32_t *)d;
			const int32_t *s32 = (const int32_t *)s;
			uint8_t *d8;
			const uint8_t *s8;

			cnt = len >> DWORD_SHIFT;
			for (i = 0; i < cnt; i++) {
				d32--;
				s32--;
				_mm_stream_si32(d32, *s32);
			}

			cnt = len & DWORD_MASK;
			d8 = (uint8_t *)d32;
			s8 = (const uint8_t *)s32;
			for (i = 0; i < cnt; i++) {
				d8--;
				s8--;
				*d8 = *s8;
			}
			pmem_flush(d8, cnt);
		}
	}

	/* serialize non-temporal store instructions */
	_mm_sfence();

	return pmemdest;
}
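/*
 * Worked example (illustrative, not from the original sources): once
 * memmove_nodrain_movnt() above has advanced the destination to a
 * cache-line boundary, a remaining len of 300 bytes decomposes as
 *
 *	300 >> CHUNK_SHIFT = 2	128-byte chunks, 8 movnt stores each (256 B)
 *	 44 >> MOVNT_SHIFT = 2	16-byte movnt stores (32 B)
 *	 12 >> DWORD_SHIFT = 3	4-byte movnti stores (12 B)
 *	 12 &  DWORD_MASK  = 0	plain byte copies
 *
 * Only the plain byte stores (and the unaligned head) need an explicit
 * clflush; the non-temporal stores bypass the cache and are ordered by
 * the final sfence.
 */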
/*
 * pmem_memmove_persist -- memmove to pmem
 */
void *
pmem_memmove_persist(void *pmemdest, const void *src, size_t len)
{
	memmove_nodrain_movnt(pmemdest, src, len);
	return pmemdest;
}

static inline void
cpuid(unsigned func, unsigned subfunc, unsigned cpuinfo[4])
{
	__cpuid_count(func, subfunc, cpuinfo[EAX_IDX], cpuinfo[EBX_IDX],
			cpuinfo[ECX_IDX], cpuinfo[EDX_IDX]);
}

static int
cpuid_check(unsigned func, unsigned reg, unsigned bit)
{
	unsigned cpuinfo[4] = { 0 };

	/* check that the requested leaf (func) is supported at all */
	cpuid(0x0, 0x0, cpuinfo);
	if (cpuinfo[EAX_IDX] < func)
		return 0;

	cpuid(func, 0x0, cpuinfo);
	return (cpuinfo[reg] & bit) != 0;
}

static int
clflushopt_avail(void)
{
	return cpuid_check(CLFLUSHOPT_FUNC, EBX_IDX, CLFLUSHOPT_BIT);
}

/* Old processors don't support clflushopt, so we default to clflush */
void (*cl_flush_opt)(void *buf, uint32_t len) = cl_flush;

__attribute__((constructor))
static void clflush_init(void)
{
	if (clflushopt_avail())
		cl_flush_opt = __cl_flush_opt;
}
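/*
 * Illustrative usage sketch, not part of the original file: the exported
 * entry point copies into persistent memory and flushes/fences the stores
 * before returning, so a caller with a DAX-mmap'ed region can simply do
 *
 *	void *pmem = ...;	// e.g. an mmap() of a DAX file
 *	pmem_memmove_persist(pmem, buf, nbytes);
 *
 * No separate flush or drain call is needed afterwards:
 * memmove_nodrain_movnt() already clflushes the unaligned edges and
 * issues the closing sfence.
 */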