1 /* Generate assembler source containing symbol information
2  *
3  * Copyright 2002       by Kai Germaschewski
4  *
5  * This software may be used and distributed according to the terms
6  * of the GNU General Public License, incorporated herein by reference.
7  *
8  * Usage: nm -n vmlinux | scripts/symbols [--all-symbols] > symbols.S
9  *
10  * ChangeLog:
11  *
12  * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
13  *      Changed the compression method from stem compression to "table lookup"
14  *      compression
15  *
16  *      Table compression uses all the unused char codes on the symbols and
17  *  maps these to the most used substrings (tokens). For instance, it might
18  *  map char code 0xF7 to represent "write_" and then in every symbol where
19  *  "write_" appears it can be replaced by 0xF7, saving 5 bytes.
20  *      The used codes themselves are also placed in the table so that the
21  *  decompresion can work without "special cases".
22  *      Applied to kernel symbols, this usually produces a compression ratio
23  *  of about 50%.
24  *
25  */
26 
27 #define _GNU_SOURCE
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <stdint.h>
33 #include <stdbool.h>
34 #include <ctype.h>
35 
36 #define KSYM_NAME_LEN		127
37 
38 
39 struct sym_entry {
40 	unsigned long long addr;
41 	unsigned int len;
42 	unsigned char *sym;
43 	char *orig_symbol;
44 	unsigned int addr_idx;
45 	unsigned int stream_offset;
46 	unsigned char type;
47 };
48 #define SYMBOL_NAME(s) ((char *)(s)->sym + 1)
49 
50 static struct sym_entry *table;
51 static unsigned int table_size, table_cnt;
52 static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext;
53 static int all_symbols = 0;
54 static int sort_by_name = 0;
55 static int map_only = 0;
56 static char symbol_prefix_char = '\0';
57 static enum { fmt_bsd, fmt_sysv } input_format;
58 static int compare_name(const void *p1, const void *p2);
59 
60 int token_profit[0x10000];
61 
62 /* the table that holds the result of the compression */
63 unsigned char best_table[256][2];
64 unsigned char best_table_len[256];
65 
66 
usage(void)67 static void usage(void)
68 {
69 	fprintf(stderr, "Usage: symbols [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n");
70 	exit(1);
71 }
72 
73 /*
74  * This ignores the intensely annoying "mapping symbols" found
75  * in ARM ELF files: $a, $t and $d.
76  */
is_arm_mapping_symbol(const char * str)77 static inline int is_arm_mapping_symbol(const char *str)
78 {
79 	return str[0] == '$' && strchr("atd", str[1])
80 	       && (str[2] == '\0' || str[2] == '.');
81 }
82 
read_symbol(FILE * in,struct sym_entry * s)83 static int read_symbol(FILE *in, struct sym_entry *s)
84 {
85 	char str[500], type[20] = "";
86 	char *sym, stype;
87 	static enum { symbol, single_source, multi_source } last;
88 	static char *filename;
89 	int rc = -1;
90 
91 	switch (input_format) {
92 	case fmt_bsd:
93 		rc = fscanf(in, "%llx %c %499s\n", &s->addr, &stype, str);
94 		break;
95 	case fmt_sysv:
96 		while (fscanf(in, "\n") == 1)
97 			/* nothing */;
98 		rc = fscanf(in, "%499[^ |] |%llx | %c |",
99 			    str, &s->addr, &stype);
100 		if (rc == 3 && fscanf(in, " %19[^ |] |", type) != 1)
101 			*type = '\0';
102 		break;
103 	}
104 	if (rc != 3) {
105 		if (rc != EOF) {
106 			/* skip line */
107 			if (fgets(str, 500, in) == NULL)
108 				return -1; /* must check fgets result */
109 		}
110 		return -1;
111 	}
112 
113 	sym = strrchr(str, '.');
114 	if (strcasecmp(type, "FILE") == 0 ||
115 	    (/*
116 	      * GNU nm prior to binutils commit 552e55ed06 (expected to
117 	      * appear in 2.27) doesn't produce a type for EFI binaries.
118 	      */
119 	     input_format == fmt_sysv && !*type && stype == '?' && sym &&
120 	     sym[1] && strchr("cSsoh", sym[1]) && !sym[2])) {
121 		/*
122 		 * gas prior to binutils commit fbdf9406b0 (expected to appear
123 		 * in 2.27) outputs symbol table entries resulting from .file
124 		 * in reverse order. If we get two consecutive file symbols,
125 		 * prefer the first one if that names an object file or has a
126 		 * directory component (to cover multiply compiled files).
127 		 */
128 		bool multi = strchr(str, '/') || (sym && sym[1] == 'o');
129 
130 		if (multi || last != multi_source) {
131 			free(filename);
132 			filename = *str ? strdup(str) : NULL;
133 		}
134 		last = multi ? multi_source : single_source;
135 		goto skip_tail;
136 	}
137 
138 	last = symbol;
139 	rc = -1;
140 
141 	sym = str;
142 	/* skip prefix char */
143 	if (symbol_prefix_char && str[0] == symbol_prefix_char)
144 		sym++;
145 
146 	/* Ignore most absolute/undefined (?) symbols. */
147 	if (strcmp(sym, "_stext") == 0)
148 		_stext = s->addr;
149 	else if (strcmp(sym, "_etext") == 0)
150 		_etext = s->addr;
151 	else if (strcmp(sym, "_sinittext") == 0)
152 		_sinittext = s->addr;
153 	else if (strcmp(sym, "_einittext") == 0)
154 		_einittext = s->addr;
155 	else if (strcmp(sym, "_sextratext") == 0)
156 		_sextratext = s->addr;
157 	else if (strcmp(sym, "_eextratext") == 0)
158 		_eextratext = s->addr;
159 	else if (toupper((uint8_t)stype) == 'A')
160 	{
161 		/* Keep these useful absolute symbols */
162 		if (strcmp(sym, "__gp"))
163 			goto skip_tail;
164 	}
165 	else if (toupper((uint8_t)stype) == 'U' ||
166 		 toupper((uint8_t)stype) == 'N' ||
167 		 is_arm_mapping_symbol(sym))
168 		goto skip_tail;
169 	/* exclude also MIPS ELF local symbols ($L123 instead of .L123) */
170 	else if (str[0] == '$')
171 		goto skip_tail;
172 
173 	/* include the type field in the symbol name, so that it gets
174 	 * compressed together */
175 	s->len = strlen(str) + 1;
176 	if (islower(stype) && filename)
177 		s->len += strlen(filename) + 1;
178 	s->sym = malloc(s->len + 1);
179 	sym = SYMBOL_NAME(s);
180 	if (islower(stype) && filename) {
181 		sym = stpcpy(sym, filename);
182 		*sym++ = '#';
183 	}
184 	strcpy(sym, str);
185 	if (sort_by_name || map_only) {
186 		s->orig_symbol = strdup(SYMBOL_NAME(s));
187 		s->type = stype; /* As s->sym[0] ends mangled. */
188 	}
189 	s->sym[0] = stype;
190 	rc = 0;
191 
192  skip_tail:
193 	if ((input_format == fmt_sysv) && fgets(str, 500, in) == NULL)
194 		/* ignore errors while discarding rest of line */;
195 
196 	return rc;
197 }
198 
symbol_valid(struct sym_entry * s)199 static int symbol_valid(struct sym_entry *s)
200 {
201 	int offset = 1;
202 
203 	/* skip prefix char */
204 	if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char)
205 		offset++;
206 
207 	/* if --all-symbols is not specified, then symbols outside the text
208 	 * and inittext sections are discarded */
209 	if (!all_symbols) {
210 		if ((s->addr < _stext || s->addr > _etext)
211 		    && (s->addr < _sinittext || s->addr > _einittext)
212 		    && (s->addr < _sextratext || s->addr > _eextratext))
213 			return 0;
214 		/* Corner case.  Discard any symbols with the same value as
215 		 * _etext _einittext or _eextratext; they can move between pass
216 		 * 1 and 2 when the symbols data are added.  If these symbols
217 		 * move then they may get dropped in pass 2, which breaks the
218 		 * symbols rules.
219 		 */
220 		if ((s->addr == _etext && strcmp((char*)s->sym + offset, "_etext")) ||
221 		    (s->addr == _einittext && strcmp((char*)s->sym + offset, "_einittext")) ||
222 		    (s->addr == _eextratext && strcmp((char*)s->sym + offset, "_eextratext")))
223 			return 0;
224 	}
225 
226 	/* Exclude symbols which vary between passes. */
227 	if (strstr((char *)s->sym + offset, "_compiled."))
228 		return 0;
229 
230 	return 1;
231 }
232 
read_map(FILE * in)233 static void read_map(FILE *in)
234 {
235 	while (!feof(in)) {
236 		if (table_cnt >= table_size) {
237 			table_size += 10000;
238 			table = realloc(table, sizeof(*table) * table_size);
239 			if (!table) {
240 				fprintf(stderr, "out of memory\n");
241 				exit (1);
242 			}
243 		}
244 		if (read_symbol(in, &table[table_cnt]) == 0)
245 			table_cnt++;
246 	}
247 }
248 
output_label(char * label)249 static void output_label(char *label)
250 {
251 	if (symbol_prefix_char)
252 		printf(".globl %c%s\n", symbol_prefix_char, label);
253 	else
254 		printf(".globl %s\n", label);
255 	printf("\tALGN\n");
256 	if (symbol_prefix_char)
257 		printf("%c%s:\n", symbol_prefix_char, label);
258 	else
259 		printf("%s:\n", label);
260 }
261 
262 /* uncompress a compressed symbol. When this function is called, the best table
263  * might still be compressed itself, so the function needs to be recursive */
expand_symbol(unsigned char * data,int len,char * result)264 static int expand_symbol(unsigned char *data, int len, char *result)
265 {
266 	int c, rlen, total=0;
267 
268 	while (len) {
269 		c = *data;
270 		/* if the table holds a single char that is the same as the one
271 		 * we are looking for, then end the search */
272 		if (best_table[c][0]==c && best_table_len[c]==1) {
273 			*result++ = c;
274 			total++;
275 		} else {
276 			/* if not, recurse and expand */
277 			rlen = expand_symbol(best_table[c], best_table_len[c], result);
278 			total += rlen;
279 			result += rlen;
280 		}
281 		data++;
282 		len--;
283 	}
284 	*result=0;
285 
286 	return total;
287 }
288 
289 /* Sort by original (non mangled) symbol name, then type. */
compare_name_orig(const void * p1,const void * p2)290 static int compare_name_orig(const void *p1, const void *p2)
291 {
292 	const struct sym_entry *sym1 = p1;
293 	const struct sym_entry *sym2 = p2;
294 	int rc;
295 
296 	rc = strcmp(sym1->orig_symbol, sym2->orig_symbol);
297 
298 	if (!rc)
299 		rc = sym1->type - sym2->type;
300 
301 	return rc;
302 }
303 
write_src(void)304 static void write_src(void)
305 {
306 	unsigned int i, k, off;
307 	unsigned int best_idx[256];
308 	unsigned int *markers;
309 	char buf[KSYM_NAME_LEN+1];
310 
311 	if (map_only) {
312 		for (i = 0; i < table_cnt; i++)
313 			printf("%#llx %c %s\n", table[i].addr, table[i].type,
314 						table[i].orig_symbol);
315 
316 		return;
317 	}
318 	printf("#include <xen/config.h>\n");
319 	printf("#include <asm/types.h>\n");
320 	printf("#if BITS_PER_LONG == 64 && !defined(SYMBOLS_ORIGIN)\n");
321 	printf("#define PTR .quad\n");
322 	printf("#define ALGN .align 8\n");
323 	printf("#else\n");
324 	printf("#define PTR .long\n");
325 	printf("#define ALGN .align 4\n");
326 	printf("#endif\n");
327 
328 	printf("\t.section .rodata, \"a\"\n");
329 
330 	printf("#ifndef SYMBOLS_ORIGIN\n");
331 	printf("#define SYMBOLS_ORIGIN 0\n");
332 	output_label("symbols_addresses");
333 	printf("#else\n");
334 	output_label("symbols_offsets");
335 	printf("#endif\n");
336 	for (i = 0; i < table_cnt; i++) {
337 		printf("\tPTR\t%#llx - SYMBOLS_ORIGIN\n", table[i].addr);
338 	}
339 	printf("\n");
340 
341 	output_label("symbols_num_syms");
342 	printf("\t.long\t%d\n", table_cnt);
343 	printf("\n");
344 
345 	/* table of offset markers, that give the offset in the compressed stream
346 	 * every 256 symbols */
347 	markers = (unsigned int *) malloc(sizeof(unsigned int) * ((table_cnt + 255) / 256));
348 
349 	output_label("symbols_names");
350 	off = 0;
351 	for (i = 0; i < table_cnt; i++) {
352 		if ((i & 0xFF) == 0)
353 			markers[i >> 8] = off;
354 
355 		printf("\t.byte 0x%02x", table[i].len);
356 		for (k = 0; k < table[i].len; k++)
357 			printf(", 0x%02x", table[i].sym[k]);
358 		printf("\n");
359 
360 		table[i].stream_offset = off;
361 		off += table[i].len + 1;
362 	}
363 	printf("\n");
364 
365 	output_label("symbols_markers");
366 	for (i = 0; i < ((table_cnt + 255) >> 8); i++)
367 		printf("\t.long\t%d\n", markers[i]);
368 	printf("\n");
369 
370 
371 	output_label("symbols_token_table");
372 	off = 0;
373 	for (i = 0; i < 256; i++) {
374 		best_idx[i] = off;
375 		expand_symbol(best_table[i], best_table_len[i], buf);
376 		printf("\t.asciz\t\"%s\"\n", buf);
377 		off += strlen(buf) + 1;
378 	}
379 	printf("\n");
380 
381 	output_label("symbols_token_index");
382 	for (i = 0; i < 256; i++)
383 		printf("\t.short\t%d\n", best_idx[i]);
384 	printf("\n");
385 
386 	if (!sort_by_name) {
387 		free(markers);
388 		return;
389 	}
390 
391 	/* Sorted by original symbol names and type. */
392 	qsort(table, table_cnt, sizeof(*table), compare_name_orig);
393 
394 	output_label("symbols_sorted_offsets");
395 	/* A fixed sized array with two entries: offset in the
396 	 * compressed stream (for symbol name), and offset in
397 	 * symbols_addresses (or symbols_offset). */
398 	for (i = 0; i < table_cnt; i++) {
399 		printf("\t.long %u, %u\n", table[i].stream_offset, table[i].addr_idx);
400 	}
401 	printf("\n");
402 
403 	free(markers);
404 }
405 
406 
407 /* table lookup compression functions */
408 
409 /* count all the possible tokens in a symbol */
learn_symbol(unsigned char * symbol,int len)410 static void learn_symbol(unsigned char *symbol, int len)
411 {
412 	int i;
413 
414 	for (i = 0; i < len - 1; i++)
415 		token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++;
416 }
417 
418 /* decrease the count for all the possible tokens in a symbol */
forget_symbol(unsigned char * symbol,int len)419 static void forget_symbol(unsigned char *symbol, int len)
420 {
421 	int i;
422 
423 	for (i = 0; i < len - 1; i++)
424 		token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--;
425 }
426 
427 /* remove all the invalid symbols from the table and do the initial token count */
build_initial_tok_table(void)428 static void build_initial_tok_table(void)
429 {
430 	unsigned int i, pos;
431 
432 	pos = 0;
433 	for (i = 0; i < table_cnt; i++) {
434 		if ( symbol_valid(&table[i]) ) {
435 			if (pos != i)
436 				table[pos] = table[i];
437 			learn_symbol(table[pos].sym, table[pos].len);
438 			pos++;
439 		}
440 	}
441 	table_cnt = pos;
442 }
443 
memmem_pvt(void * h,size_t hlen,void * n,size_t nlen)444 static void *memmem_pvt(void *h, size_t hlen, void *n, size_t nlen)
445 {
446 	char *p;
447 	for (p = h; (p - (char *)h) <= (long)(hlen - nlen); p++)
448 		if (!memcmp(p, n, nlen)) return p;
449 	return NULL;
450 }
451 
452 /* replace a given token in all the valid symbols. Use the sampled symbols
453  * to update the counts */
compress_symbols(unsigned char * str,int idx)454 static void compress_symbols(unsigned char *str, int idx)
455 {
456 	unsigned int i, len, size;
457 	unsigned char *p1, *p2;
458 
459 	for (i = 0; i < table_cnt; i++) {
460 
461 		len = table[i].len;
462 		p1 = table[i].sym;
463 
464 		table[i].addr_idx = i;
465 		/* find the token on the symbol */
466 		p2 = memmem_pvt(p1, len, str, 2);
467 		if (!p2) continue;
468 
469 		/* decrease the counts for this symbol's tokens */
470 		forget_symbol(table[i].sym, len);
471 
472 		size = len;
473 
474 		do {
475 			*p2 = idx;
476 			p2++;
477 			size -= (p2 - p1);
478 			memmove(p2, p2 + 1, size);
479 			p1 = p2;
480 			len--;
481 
482 			if (size < 2) break;
483 
484 			/* find the token on the symbol */
485 			p2 = memmem_pvt(p1, size, str, 2);
486 
487 		} while (p2);
488 
489 		table[i].len = len;
490 
491 		/* increase the counts for this symbol's new tokens */
492 		learn_symbol(table[i].sym, len);
493 	}
494 }
495 
496 /* search the token with the maximum profit */
find_best_token(void)497 static int find_best_token(void)
498 {
499 	int i, best, bestprofit;
500 
501 	bestprofit=-10000;
502 	best = 0;
503 
504 	for (i = 0; i < 0x10000; i++) {
505 		if (token_profit[i] > bestprofit) {
506 			best = i;
507 			bestprofit = token_profit[i];
508 		}
509 	}
510 	return best;
511 }
512 
513 /* this is the core of the algorithm: calculate the "best" table */
optimize_result(void)514 static void optimize_result(void)
515 {
516 	int i, best;
517 
518 	/* using the '\0' symbol last allows compress_symbols to use standard
519 	 * fast string functions */
520 	for (i = 255; i >= 0; i--) {
521 
522 		/* if this table slot is empty (it is not used by an actual
523 		 * original char code */
524 		if (!best_table_len[i]) {
525 
526 			/* find the token with the breates profit value */
527 			best = find_best_token();
528 			if (token_profit[best] == 0)
529 			        break;
530 
531 			/* place it in the "best" table */
532 			best_table_len[i] = 2;
533 			best_table[i][0] = best & 0xFF;
534 			best_table[i][1] = (best >> 8) & 0xFF;
535 
536 			/* replace this token in all the valid symbols */
537 			compress_symbols(best_table[i], i);
538 		}
539 	}
540 }
541 
542 /* start by placing the symbols that are actually used on the table */
insert_real_symbols_in_table(void)543 static void insert_real_symbols_in_table(void)
544 {
545 	unsigned int i, j, c;
546 
547 	memset(best_table, 0, sizeof(best_table));
548 	memset(best_table_len, 0, sizeof(best_table_len));
549 
550 	for (i = 0; i < table_cnt; i++) {
551 		for (j = 0; j < table[i].len; j++) {
552 			c = table[i].sym[j];
553 			best_table[c][0]=c;
554 			best_table_len[c]=1;
555 		}
556 	}
557 }
558 
optimize_token_table(void)559 static void optimize_token_table(void)
560 {
561 	build_initial_tok_table();
562 
563 	insert_real_symbols_in_table();
564 
565 	/* When valid symbol is not registered, exit to error */
566 	if (!table_cnt) {
567 		fprintf(stderr, "No valid symbol.\n");
568 		exit(1);
569 	}
570 
571 	optimize_result();
572 }
573 
compare_value(const void * p1,const void * p2)574 static int compare_value(const void *p1, const void *p2)
575 {
576 	const struct sym_entry *sym1 = p1;
577 	const struct sym_entry *sym2 = p2;
578 
579 	if (sym1->addr < sym2->addr)
580 		return -1;
581 	if (sym1->addr > sym2->addr)
582 		return +1;
583 	/* Prefer global symbols. */
584 	if (isupper(*sym1->sym))
585 		return -1;
586 	if (isupper(*sym2->sym))
587 		return +1;
588 	return 0;
589 }
590 
compare_name(const void * p1,const void * p2)591 static int compare_name(const void *p1, const void *p2)
592 {
593 	const struct sym_entry *sym1 = p1;
594 	const struct sym_entry *sym2 = p2;
595 
596 	return strcmp(SYMBOL_NAME(sym1), SYMBOL_NAME(sym2));
597 }
598 
main(int argc,char ** argv)599 int main(int argc, char **argv)
600 {
601 	unsigned int i;
602 	bool unsorted = false, warn_dup = false, error_dup = false, found_dup = false;
603 
604 	if (argc >= 2) {
605 		for (i = 1; i < argc; i++) {
606 			if(strcmp(argv[i], "--all-symbols") == 0)
607 				all_symbols = 1;
608 			else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) {
609 				char *p = &argv[i][16];
610 				/* skip quote */
611 				if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\''))
612 					p++;
613 				symbol_prefix_char = *p;
614 			} else if (strcmp(argv[i], "--sysv") == 0)
615 				input_format = fmt_sysv;
616 			else if (strcmp(argv[i], "--sort") == 0)
617 				unsorted = true;
618 			else if (strcmp(argv[i], "--sort-by-name") == 0)
619 				sort_by_name = 1;
620 			else if (strcmp(argv[i], "--warn-dup") == 0)
621 				warn_dup = true;
622 			else if (strcmp(argv[i], "--error-dup") == 0)
623 				warn_dup = error_dup = true;
624 			else if (strcmp(argv[i], "--xensyms") == 0)
625 				map_only = true;
626 			else
627 				usage();
628 		}
629 	} else if (argc != 1)
630 		usage();
631 
632 	read_map(stdin);
633 
634 	if (warn_dup) {
635 		qsort(table, table_cnt, sizeof(*table), compare_name);
636 		for (i = 1; i < table_cnt; ++i)
637 			if (strcmp(SYMBOL_NAME(table + i - 1),
638 				   SYMBOL_NAME(table + i)) == 0 &&
639 			    table[i - 1].addr != table[i].addr) {
640 				fprintf(stderr,
641 					"Duplicate symbol '%s' (%llx != %llx)\n",
642 					SYMBOL_NAME(table + i),
643 					table[i].addr, table[i - 1].addr);
644 				found_dup = true;
645 			}
646 		unsorted = true;
647 	}
648 
649 	if (error_dup && found_dup)
650 		exit(1);
651 
652 	if (unsorted)
653 		qsort(table, table_cnt, sizeof(*table), compare_value);
654 
655 	optimize_token_table();
656 	write_src();
657 
658 	return 0;
659 }
660