1/* SPDX-License-Identifier: GPL-2.0 */
2// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
3
4#include <linux/linkage.h>
5
/*
 * GET_FRONT_BITS reg, count
 *
 * Shift "reg" in place by "count" bits. When __cskyLE__ is defined the
 * shift is a logical right shift (lsri); otherwise it is a logical left
 * shift (lsli). It is the mirror image of GET_AFTER_BITS.
 */
.macro	GET_FRONT_BITS reg count
#ifndef	__cskyLE__
	lsli	\reg, \count
#else
	lsri	\reg, \count
#endif
.endm
13
/*
 * GET_AFTER_BITS reg, count
 *
 * Shift "reg" in place by "count" bits. When __cskyLE__ is defined the
 * shift is a logical left shift (lsli); otherwise it is a logical right
 * shift (lsri). It is the mirror image of GET_FRONT_BITS.
 */
.macro	GET_AFTER_BITS reg count
#ifndef	__cskyLE__
	lsri	\reg, \count
#else
	lsli	\reg, \count
#endif
.endm
21
/*
 * void *memcpy(void *dest, const void *src, size_t n);
 *
 * Register usage, as read from the code below:
 *   r2        dest on entry. It is never written, so it still holds dest
 *             when the routine returns (presumably the ABI return
 *             register - confirm against the C-SKY ABIv1 document).
 *   r3        source cursor
 *   r4        number of bytes still to copy
 *   r7        destination cursor (starts as a copy of r2)
 *   r1,r5,r6  scratch
 *   r8-r11    extra data registers for the 16-byte loops; they are
 *             spilled to the stack before and reloaded after each loop.
 *
 * Outline:
 *   - n < 4: byte loop.
 *   - dest misaligned: copy single bytes until dest is word aligned.
 *   - dest and src both word aligned: 16-byte, then 4-byte, then byte
 *     loops.
 *   - dest aligned, src misaligned by 1, 2 or 3: read aligned source
 *     words and stitch each destination word together from two
 *     neighbouring source words with GET_FRONT_BITS / GET_AFTER_BITS.
 *   - Before the stitched or dest-aligning paths are used, buffers whose
 *     distance |dest - src| is smaller than n are handed to the forward
 *     byte loop instead.
 */
ENTRY(memcpy)
	mov	r7, r2			/* work on a copy, keep r2 intact */
	cmplti	r4, 4
	bt	.L_copy_by_byte		/* fewer than 4 bytes in total */
	mov	r6, r2
	andi	r6, 3			/* r6 = dest & 3 */
	cmpnei	r6, 0
	jbt	.L_dest_not_aligned
	mov	r6, r3
	andi	r6, 3			/* r6 = src & 3 */
	cmpnei	r6, 0
	jbt	.L_dest_aligned_but_src_not_aligned

/* dest and src are both word aligned from here on. */
.L0:
	cmplti	r4, 16
	jbt	.L_aligned_and_len_less_16bytes
	subi	sp, 8			/* 8 bytes reserved, one word used */
	stw	r8, (sp, 0)

/* Copy 16 bytes per iteration while at least 16 bytes remain. */
.L_aligned_and_len_larger_16bytes:
	ldw	r1, (r3, 0)
	ldw	r5, (r3, 4)
	ldw	r8, (r3, 8)
	stw	r1, (r7, 0)
	ldw	r1, (r3, 12)		/* r1 is reused for the fourth word */
	stw	r5, (r7, 4)
	stw	r8, (r7, 8)
	stw	r1, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L_aligned_and_len_larger_16bytes
	ldw	r8, (sp, 0)
	addi	sp, 8
	cmpnei	r4, 0
	jbf	.L_return		/* nothing left */

/* Fewer than 16 bytes remain: copy whole words, then the byte tail. */
.L_aligned_and_len_less_16bytes:
	cmplti	r4, 4
	bt	.L_copy_by_byte
.L1:
	ldw	r1, (r3, 0)
	stw	r1, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	jbf	.L1
	br	.L_copy_by_byte

.L_return:
	rts

/*
 * Forward byte loop. Used for short copies, for the tail of every word
 * loop, and as the fallback when the buffers are close together.
 */
.L_copy_by_byte:
	cmpnei	r4, 0
	jbf	.L_return		/* zero bytes left */
.L4:
	ldb	r1, (r3, 0)
	stb	r1, (r7, 0)
	addi	r3, 1
	addi	r7, 1
	decne	r4			/* r4 -= 1, condition set while r4 != 0 */
	jbt	.L4
	rts

/*
 * dest is not word aligned: copy single bytes until it is.
 * After that, check whether src is aligned as well.
 */
.L_dest_not_aligned:
	mov	r5, r3
	rsub	r5, r5, r7
	abs	r5, r5			/* r5 = |dest - src| */
	cmplt	r5, r4
	bt	.L_copy_by_byte		/* closer than n bytes: byte loop */
	mov	r5, r7
	sub	r5, r3			/* r5 = dest - src */
	cmphs	r5, r4
	bf	.L_copy_by_byte		/* (unsigned) dest - src < n */
	mov	r5, r6			/* r5 counts up from dest & 3 to 4 */
.L5:
	ldb	r1, (r3, 0)
	stb	r1, (r7, 0)
	addi	r5, 1
	subi	r4, 1
	addi	r3, 1
	addi	r7, 1
	cmpnei	r5, 4
	jbt	.L5
	cmplti	r4, 4
	jbt	.L_copy_by_byte
	mov	r6, r3			/* is src word aligned now? */
	andi	r6, 3
	cmpnei	r6, 0
	jbf	.L0			/* yes: use the aligned loops */

/*
 * dest is word aligned, src is not; r6 = src & 3 (1, 2 or 3).
 * r3 is rounded down to a word boundary, the word there is loaded into
 * r1 and r3 is advanced by 4, so r3 runs ahead of the real source
 * position by (4 - r6) bytes until the "subi r3, ..." in each tail.
 */
.L_dest_aligned_but_src_not_aligned:
	mov	r5, r3
	rsub	r5, r5, r7
	abs	r5, r5			/* r5 = |dest - src| */
	cmplt	r5, r4
	bt	.L_copy_by_byte		/* closer than n bytes: byte loop */
	bclri	r3, 0
	bclri	r3, 1			/* r3 &= ~3 */
	ldw	r1, (r3, 0)		/* r1 = first (partial) source word */
	addi	r3, 4
	cmpnei	r6, 2
	bf	.L_dest_aligned_but_src_not_aligned_2bytes
	cmpnei	r6, 3
	bf	.L_dest_aligned_but_src_not_aligned_3bytes

/* src & 3 == 1: front shift 8, after shift 24, r3 is 3 bytes ahead. */
.L_dest_aligned_but_src_not_aligned_1byte:
	mov	r5, r7
	sub	r5, r3
	cmphs	r5, r4
	/*
	 * Fall back to the byte loop when (unsigned) dest - r3 < n.
	 * r3 has already been advanced to src + 3 at this point, so it has
	 * to be rewound first; branching straight to .L_copy_by_byte would
	 * copy from the wrong address.
	 */
	bf	.L_1byte_rewind_src
	cmplti	r4, 16
	bf	.L11
.L10:					/* fewer than 16 bytes: word at a time */
	GET_FRONT_BITS r1 8
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6			/* keep the raw word for the next round */
	GET_AFTER_BITS r6 24
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L10
.L_1byte_rewind_src:
	subi	r3, 3			/* back to the real source position */
	br	.L_copy_by_byte
.L11:
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)
.L12:					/* 16 bytes per iteration */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 8		/* shift direction depends on endianness */
	mov	r10, r5
	GET_AFTER_BITS r5 24
	or	r5, r1

	GET_FRONT_BITS r10 8
	mov	r1, r11
	GET_AFTER_BITS r11 24
	or	r11, r10

	GET_FRONT_BITS r1 8
	mov	r10, r8
	GET_AFTER_BITS r8 24
	or	r8, r1

	GET_FRONT_BITS r10 8
	mov	r1, r9			/* raw last word carries into next round */
	GET_AFTER_BITS r9 24
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L12
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L10			/* at least one more whole word */
	subi	r3, 3
	br	.L_copy_by_byte

/* src & 3 == 2: front shift 16, after shift 16, r3 is 2 bytes ahead. */
.L_dest_aligned_but_src_not_aligned_2bytes:
	cmplti	r4, 16
	bf	.L21
.L20:					/* fewer than 16 bytes: word at a time */
	GET_FRONT_BITS r1 16
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6			/* keep the raw word for the next round */
	GET_AFTER_BITS r6 16
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L20
	subi	r3, 2			/* back to the real source position */
	br	.L_copy_by_byte

.L21:					/* n >= 16 */
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)

.L22:					/* 16 bytes per iteration */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 16
	mov	r10, r5
	GET_AFTER_BITS r5 16
	or	r5, r1

	GET_FRONT_BITS r10 16
	mov	r1, r11
	GET_AFTER_BITS r11 16
	or	r11, r10

	GET_FRONT_BITS r1 16
	mov	r10, r8
	GET_AFTER_BITS r8 16
	or	r8, r1

	GET_FRONT_BITS r10 16
	mov	r1, r9			/* raw last word carries into next round */
	GET_AFTER_BITS r9 16
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L22
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L20			/* at least one more whole word */
	subi	r3, 2
	br	.L_copy_by_byte


/* src & 3 == 3: front shift 24, after shift 8, r3 is 1 byte ahead. */
.L_dest_aligned_but_src_not_aligned_3bytes:
	cmplti	r4, 16
	bf	.L31
.L30:					/* fewer than 16 bytes: word at a time */
	GET_FRONT_BITS r1 24
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6			/* keep the raw word for the next round */
	GET_AFTER_BITS r6 8
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L30
	subi	r3, 1			/* back to the real source position */
	br	.L_copy_by_byte
.L31:
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)
.L32:					/* 16 bytes per iteration */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 24
	mov	r10, r5
	GET_AFTER_BITS r5 8
	or	r5, r1

	GET_FRONT_BITS r10 24
	mov	r1, r11
	GET_AFTER_BITS r11 8
	or	r11, r10

	GET_FRONT_BITS r1 24
	mov	r10, r8
	GET_AFTER_BITS r8 8
	or	r8, r1

	GET_FRONT_BITS r10 24
	mov	r1, r9			/* raw last word carries into next round */
	GET_AFTER_BITS r9 8
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L32
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L30			/* at least one more whole word */
	subi	r3, 1
	br	.L_copy_by_byte
348