/* Taken from Linux arch/arm */
#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64

#include <asm/system.h>
#include <xen/types.h>

/*
 * The semantics of do_div() are:
 *
 * uint32_t do_div(uint64_t *n, uint32_t base)
 * {
 * 	uint32_t remainder = *n % base;
 * 	*n = *n / base;
 * 	return remainder;
 * }
 *
 * In other words, a 64-bit dividend with a 32-bit divisor producing
 * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
 * we call a special __do_div64 helper with a completely non-standard
 * calling convention for arguments and results (beware).
 */
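
/*
 * Usage sketch (illustrative only; 'ticks' is a hypothetical caller
 * variable, not part of this header).  The macro modifies its first
 * argument in place and evaluates to the remainder:
 *
 *	uint64_t ticks = 1000000007ULL;
 *	uint32_t rem = do_div(ticks, 1000);
 *	// afterwards ticks == 1000000 and rem == 7
 */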


#if BITS_PER_LONG == 64

# define do_div(n,base) ({                                      \
        uint32_t __base = (base);                               \
        uint32_t __rem;                                         \
        __rem = ((uint64_t)(n)) % __base;                       \
        (n) = ((uint64_t)(n)) / __base;                         \
        __rem;                                                  \
 })

#elif BITS_PER_LONG == 32

#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif

#define __do_div_asm(n, base)					\
({								\
	register unsigned int __base      asm("r4") = base;	\
	register unsigned long long __n   asm("r0") = n;	\
	register unsigned long long __res asm("r2");		\
	register unsigned int __rem       asm(__xh);		\
	asm(	__asmeq("%0", __xh)				\
		__asmeq("%1", "r2")				\
		__asmeq("%2", "r0")				\
		__asmeq("%3", "r4")				\
		"bl	__do_div64"				\
		: "=r" (__rem), "=r" (__res)			\
		: "r" (__n), "r" (__base)			\
		: "ip", "lr", "cc");				\
	n = __res;						\
	__rem;							\
})
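
/*
 * Note (descriptive, derived from the register bindings above): the
 * __do_div64 helper is called with the 64-bit dividend in r0-r1 and the
 * divisor in r4; it returns the 64-bit quotient in r2-r3 and leaves the
 * 32-bit remainder in the register holding the dividend's high word
 * (r1 little-endian, r0 big-endian).
 */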

#if __GNUC__ < 4

/*
 * gcc versions earlier than 4.0 are simply too problematic for the
 * optimized implementation below.  In particular, gcc PR 15089, which
 * tends to trigger on more complex constructs, causes spurious .global
 * __udivsi3 references to be inserted even when none of those symbols
 * are referenced in the generated code, and those gcc versions cannot
 * do constant propagation on long long values anyway.
 */
#define do_div(n, base) __do_div_asm(n, base)

#elif __GNUC__ >= 4

#include <asm/bug.h>

/*
 * If the divisor happens to be constant, we determine the appropriate
 * inverse at compile time to turn the division into a few inline
 * multiplications instead, which is much faster.  This applies only
 * when compiling for ARMv4 or higher (we need umull/umlal) and only if
 * the gcc version is sufficiently recent to perform proper long long
 * constant propagation.  (It is unfortunate that gcc doesn't perform
 * all this internally.)
 */
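
/*
 * Worked example (illustrative, not exhaustive): for a constant divisor
 * of 10 the code below starts from __p = 1 << __div64_fls(10) = 8 and
 * __m = ((__p << 64) + 10 - 1) / 10 = 0xCCCCCCCCCCCCCCCD, i.e.
 * n / 10 == (n * __m) >> 67 using the full 128-bit product; the
 * sanitize/optimize step further down then reduces or corrects this
 * (__m, __p) pair where it can.
 */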
#define do_div(n, base)							\
({									\
	unsigned int __r, __b = (base);					\
	if (!__builtin_constant_p(__b) || __b == 0) {			\
		/* non-constant divisor (or zero): slow path */		\
		__r = __do_div_asm(n, __b);				\
	} else if ((__b & (__b - 1)) == 0) {				\
		/* Trivial: __b is constant and a power of 2 */		\
		/* gcc does the right thing with this code.  */		\
		__r = n;						\
		__r &= (__b - 1);					\
		n /= __b;						\
	} else {							\
		/* Multiply by inverse of __b: n/b = n*(p/b)/p       */	\
		/* We rely on the fact that most of this code gets   */	\
		/* optimized away at compile time due to constant    */	\
		/* propagation and only a couple inline assembly     */	\
		/* instructions should remain. Better avoid any      */	\
		/* code construct that might prevent that.           */	\
		unsigned long long __res, __x, __t, __m, __n = n;	\
		unsigned int __c, __p, __z = 0;				\
		/* preserve low part of n for remainder computation */	\
		__r = __n;						\
		/* determine number of bits to represent __b */		\
		__p = 1 << __div64_fls(__b);				\
		/* compute __m = ((__p << 64) + __b - 1) / __b */	\
		__m = (~0ULL / __b) * __p;				\
		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;	\
		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */	\
		__x = ~0ULL / __b * __b - 1;				\
		__res = (__m & 0xffffffff) * (__x & 0xffffffff);	\
		__res >>= 32;						\
		__res += (__m & 0xffffffff) * (__x >> 32);		\
		__t = __res;						\
		__res += (__x & 0xffffffff) * (__m >> 32);		\
		__t = (__res < __t) ? (1ULL << 32) : 0;			\
		__res = (__res >> 32) + __t;				\
		__res += (__m >> 32) * (__x >> 32);			\
		__res /= __p;						\
		/* Now sanitize and optimize what we've got. */		\
		if (~0ULL % (__b / (__b & -__b)) == 0) {		\
			/* those cases can be simplified with: */	\
			__n /= (__b & -__b);				\
			__m = ~0ULL / (__b / (__b & -__b));		\
			__p = 1;					\
			__c = 1;					\
		} else if (__res != __x / __b) {			\
			/* We can't get away without a correction    */	\
			/* to compensate for bit truncation errors.  */	\
			/* To avoid it we'd need an additional bit   */	\
			/* to represent __m which would overflow it. */	\
			/* Instead we do m=p/b and n/b=(n*m+m)/p.    */	\
			__c = 1;					\
			/* Compute __m = (__p << 64) / __b */		\
			__m = (~0ULL / __b) * __p;			\
			__m += ((~0ULL % __b + 1) * __p) / __b;		\
		} else {						\
			/* Reduce __m/__p, and try to clear bit 31   */	\
			/* of __m when possible otherwise that'll    */	\
			/* need extra overflow handling later.       */	\
			unsigned int __bits = -(__m & -__m);		\
			__bits |= __m >> 32;				\
			__bits = (~__bits) << 1;			\
			/* If __bits == 0 then setting bit 31 is     */	\
			/* unavoidable.  Simply apply the maximum    */	\
			/* possible reduction in that case.          */	\
			/* Otherwise the MSB of __bits indicates the */	\
			/* best reduction we should apply.           */	\
			if (!__bits) {					\
				__p /= (__m & -__m);			\
				__m /= (__m & -__m);			\
			} else {					\
				__p >>= __div64_fls(__bits);		\
				__m >>= __div64_fls(__bits);		\
			}						\
			/* No correction needed. */			\
			__c = 0;					\
		}							\
		/* Now we have a combination of 2 conditions:        */	\
		/* 1) whether or not we need a correction (__c), and */	\
		/* 2) whether or not there might be an overflow in   */	\
		/*    the cross product (__m & ((1<<63) | (1<<31)))  */	\
		/* Select the best insn combination to perform the   */	\
		/* actual __m * __n / (__p << 64) operation.         */	\
		if (!__c) {						\
			asm (	"umull	%Q0, %R0, %1, %Q2\n\t"		\
				"mov	%Q0, #0"			\
				: "=&r" (__res)				\
				: "r" (__m), "r" (__n)			\
				: "cc" );				\
		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {	\
			__res = __m;					\
			asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"		\
				"mov	%Q0, #0"			\
				: "+&r" (__res)				\
				: "r" (__m), "r" (__n)			\
				: "cc" );				\
		} else {						\
			asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"		\
				"cmn	%Q0, %Q1\n\t"			\
				"adcs	%R0, %R0, %R1\n\t"		\
				"adc	%Q0, %3, #0"			\
				: "=&r" (__res)				\
				: "r" (__m), "r" (__n), "r" (__z)	\
				: "cc" );				\
		}							\
		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {		\
			asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"		\
				"umlal	%R0, %Q0, %Q1, %R2\n\t"		\
				"mov	%R0, #0\n\t"			\
				"umlal	%Q0, %R0, %R1, %R2"		\
				: "+&r" (__res)				\
				: "r" (__m), "r" (__n)			\
				: "cc" );				\
		} else {						\
			asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"		\
				"umlal	%R0, %1, %Q2, %R3\n\t"		\
				"mov	%R0, #0\n\t"			\
				"adds	%Q0, %1, %Q0\n\t"		\
				"adc	%R0, %R0, #0\n\t"		\
				"umlal	%Q0, %R0, %R2, %R3"		\
				: "+&r" (__res), "+&r" (__z)		\
				: "r" (__m), "r" (__n)			\
				: "cc" );				\
		}							\
		__res /= __p;						\
		/* The remainder can be computed with 32-bit regs    */	\
		/* only, and gcc is good at that.                    */	\
		{							\
			unsigned int __res0 = __res;			\
			unsigned int __b0 = __b;			\
			__r -= __res0 * __b0;				\
		}							\
		/* BUG_ON(__r >= __b || __res * __b + __r != n); */	\
		n = __res;						\
	}								\
	__r;								\
})

/* our own fls implementation to make sure constant propagation is fine */
#define __div64_fls(bits)						\
({									\
	unsigned int __left = (bits), __nr = 0;				\
	if (__left & 0xffff0000) __nr += 16, __left >>= 16;		\
	if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;		\
	if (__left & 0x000000f0) __nr +=  4, __left >>=  4;		\
	if (__left & 0x0000000c) __nr +=  2, __left >>=  2;		\
	if (__left & 0x00000002) __nr +=  1;				\
	__nr;								\
})
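
/*
 * Note (added for clarity): unlike a conventional 1-based fls(), this
 * evaluates to the zero-based index of the most significant set bit,
 * e.g. __div64_fls(1) == 0, __div64_fls(10) == 3 and
 * __div64_fls(0x80000000) == 31, so for nonzero __b the expression
 * 1 << __div64_fls(__b) is the largest power of 2 not exceeding __b.
 */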

#endif /* GCC version */

#endif /* BITS_PER_LONG */

#endif /* __ASM_ARM_DIV64 */
/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */