1#!/usr/bin/perl -w
2#
3# Written with reference to pandoc_markdown from Debian jessie
4# We require atx-style headers
5#
6# usage:
7#   pandoc -t json SUPPORT.md >j-unstable
8#   git cat-file blob origin/staging-4.11:SUPPORT.md | pandoc -t json >j-4.11
9#   docs/parse-support-md \
10#            j-unstable https://xenbits/unstable/SUPPORT.html
11#            j-4.11 https://xenbits/4.11/SUPPORT.html
12# or equivalent
13
14use strict;
15use JSON;
16use Tie::IxHash;
17use IO::File;
18use CGI qw(escapeHTML);
19use Data::Dumper;
20use POSIX;
21
22#---------- accumulating input/output ----------
23
24# This combines information from all of the input files.
25
26sub new_sectlist () { { } };
27our $toplevel_sectlist = new_sectlist();
28# an $sectlist is
29#   { }                 nothing seen yet
30#   a tied hashref      something seen
31# (tied $sectlist)    is an object of type Tie::IxHash
32# $sectlist->{KEY} a $sectnode:
33# $sectlist->{KEY}{Status}[VI] = absent or string or markdown content
34# $sectlist->{KEY}{Children} = a further $sectlist
35# $sectlist->{KEY}{Key} = KEY
36# $sectlist->{KEY}{RealSectNode} = us, or our parent
37# $sectlist->{KEY}{RealSectNode}{HasCaveat}[VI] = trueish iff other in a Para
38# $sectlist->{KEY}{RealInSect} = containing real section in @insections, so
39# $sectlist->{KEY}{RealInSect}{HasDescription} = VI for some Emph in Para
40# $sectlist->{KEY}{RealInSect}{Anchor} = value for < id="" > in the pandoc html
41# A $sectnode represents a single section from the original markdown
42# document.  Its subsections are in Children.
43#
44# Also, the input syntax:
45#    Status, something or other: Supported
46# is treated as a $sectnode, is as if it were a subsection -
47# one called `something or other'.  That is not a `real' section.
48#
49# KEY is the Anchor, or derived from the `something or other'.
50# It is used to match up identical features in different versions.
51
52#---------- state for this input file ----------
53
54our $version_index;
55our @version_urls;
56
57our @insections;
58# $insections[]{Key} = string
59# $insections[]{Headline} = markdown content
60# these next are only defined for real sections, not Status elements
61# $insections[]{Anchor} = string
62# $insections[]{HasDescription} VI, likewise
63
64our $had_unknown;
65our $had_feature;
66# adding new variable ?  it must be reset in r_toplevel
67
68#---------- parsing ----------
69
70sub find_current_sectnode () {
71    die unless @insections;
72
73    my $sectnode;
74    my $realinsect;
75    my $realsectnode;
76    foreach my $s (@insections) {
77        my $sectlist = $sectnode
78            ? $sectnode->{Children} : $toplevel_sectlist;
79        my $key = $s->{Key};
80        $realinsect = $s if $s->{Anchor};
81        tie %$sectlist, 'Tie::IxHash' unless tied %$sectlist;
82#print STDERR "FIND_CURRENT_SECTNODE ", Dumper($s);
83        $sectlist->{$key} //=
84            {
85             Children => new_sectlist(),
86             Headline => $s->{Headline},
87             Key => $key,
88             RealInSect => $realinsect,
89             HasCaveat => [],
90            };
91        $sectnode = $sectlist->{$key};
92        $realsectnode = $sectnode if $s->{Anchor};
93        $sectnode->{RealSectNode} = $realsectnode;
94    }
95    die unless $sectnode;
96    return $sectnode;
97}
98
99sub ri_Header {
100    my ($c) = @_;
101    my ($level, $infos, $hl) = @$c;
102#print STDERR 'RI_HEADER ', Dumper($c, \@c);
103    my ($id) = @$infos;
104    die unless $level >= 1;
105    die unless $level-2 <= $#insections;
106    $#insections = $level-2;
107    push @insections,
108        {
109         Key => $id,
110         Anchor => $id,
111         Headline => $hl,
112         HasDescription => undef,
113        };
114#print STDERR Dumper(\@insections);
115    $had_feature = 0;
116}
117
118sub ri_Para {
119    return unless @insections;
120    my $insection = $insections[$#insections];
121#    print DEBUG "ri_Para ",
122#        Dumper($version_index, $had_feature, $insection);
123
124    if ($had_feature) {
125        my $sectnode = find_current_sectnode();
126        $sectnode->{RealSectNode}{HasCaveat}[$version_index] = 1;
127    } else {
128        $insection->{HasDescription} //= $version_index;
129    }
130};
131
132sub parse_feature_entry ($) {
133    my ($value) = @_;
134
135    $had_feature = 1;
136    my $sectnode = find_current_sectnode();
137    $sectnode->{Status}[$version_index] = $value;
138}
139
140sub descr2key ($) {
141    my ($descr) = @_;
142
143    die unless @insections;
144    my $insection = $insections[$#insections];
145
146    my $key = lc $descr;
147    $key =~ y/ /-/;
148    $key =~ y/-0-9A-Za-z//cd;
149    $key = $insection->{Anchor}.'--'.$key;
150    return $key;
151}
152
153sub ri_CodeBlock {
154    my ($c) = @_;
155    my ($infos, $text) = @$c;
156
157    if ($text =~ m{^(?: Functional\ completeness
158                   | Functional\ stability
159                   | Interface\ stability
160                   | Security\ supported ) \:}x) {
161        # ignore this
162        return;
163    }
164    die "$had_unknown / $text ?" if $had_unknown;
165
166    my $toplevel = $text =~ m{^Xen-Version:};
167
168    foreach my $l (split /\n/, $text) {
169        $l =~ s/\s*$//;
170        next unless $l =~ m/\S/;
171
172        my ($descr, $value) =
173            $toplevel
174            ? $l =~ m{^([A-Z][-A-Z0-9a-z]+)\:\s+(\S.*)$}
175            : $l =~ m{^(?:Status|Supported)(?:\,\s*([^:]+))?\:\s+(\S.*)$}
176            or die ("$text\n^ cannot parse status codeblock line:".
177                    ($toplevel and 'top').
178                    "\n$l\n ?");
179
180        if (length $descr) {
181            push @insections,
182                {
183                 Key => descr2key($descr),
184                 Headline => [{ t => 'Str', c => $descr }],
185                };
186        }
187        parse_feature_entry $value;
188        if (length $descr) {
189            pop @insections;
190        }
191    }
192}
193
194sub ri_DefinitionList {
195    my ($c) = @_;
196    foreach my $defent (@$c) {
197        my ($term, $defns) = @$defent;
198        my $descr =
199            join ' ',
200            map { $_->{c} }
201            grep { $_->{t} eq 'Str' }
202            @$term;
203        push @insections,
204            {
205             Key => descr2key($descr),
206             Headline => $term,
207            };
208        die "multiple definitions in definition list definition"
209            if @$defns > 1;
210        my $defn = $defns->[0];
211        die "multiple paras in definition list definition"
212            if @$defn > 1;
213        my $defnp = $defn->[0];
214        die "only understand plain definition not $defnp->{t} ?"
215            unless $defnp->{t} eq 'Plain';
216        parse_feature_entry $defnp->{c};
217        pop @insections;
218    }
219}
220
221sub process_unknown {
222    my ($c, $e) = @_;
223    $had_unknown = Dumper($e);
224}
225
226sub r_content ($) {
227    my ($i) = @_;
228    foreach my $e (@$i) {
229        my $f = ${*::}{"ri_$e->{t}"};
230        $f //= \&process_unknown;
231        $f->($e->{c}, $e);
232    }
233}
234
235our $pandoc_toplevel_constructor;
236
237sub r_toplevel ($) {
238    my ($i) = @_;
239
240    die unless defined $version_index;
241
242    @insections = ();
243    $had_unknown = undef;
244    $had_feature = undef;
245
246    # Pandoc's JSON output changed some time between 1.17.2 (stretch)
247    # and 2.2.1 (buster).  I can find no documentation about this
248    # change or about the compatibility rules.  (It seems that
249    # processing the parse tree *is* supported upstream: they offer
250    # many libraries to do this inside the pandoc process.)
251    # Empirically, what has changed is just the top level structure.
252    # Also pandoc wants the same structure back that it spat out,
253    # when we ask it to format snippets.
254
255    my $blocks;
256    if (ref $i eq 'ARRAY') {
257	$pandoc_toplevel_constructor = sub {
258	    my ($blocks) = @_;
259	    return [
260		    { unMeta => { } },
261		    $blocks,
262		   ];
263	};
264	foreach my $e (@$i) {
265	    next unless ref $e eq 'ARRAY';
266	    r_content $e;
267	}
268    } elsif (ref $i eq 'HASH') {
269	my $api_version = $i->{'pandoc-api-version'};
270	$pandoc_toplevel_constructor = sub {
271	    my ($blocks) = @_;
272	    return {
273		    blocks => $blocks,
274		    meta => { },
275		    'pandoc-api-version' => $api_version,
276		   };
277	};
278	r_content $i->{blocks};
279    } else {
280	die;
281    }
282}
283
284sub read_inputs () {
285    $version_index = 0;
286
287    local $/;
288    undef $/;
289
290    while (my $f = shift @ARGV) {
291        push @version_urls, shift @ARGV;
292        eval {
293            open F, '<', $f or die $!;
294            my $input_toplevel = decode_json <F>;
295            r_toplevel $input_toplevel;
296        };
297        die "$@\nwhile processing input file $f\n" if $@;
298        $version_index++;
299    }
300}
301
302#---------- reprocessing ----------
303
304# variables generated by analyse_reprocess:
305our $maxdepth;
306
307sub pandoc2html_inline ($) {
308    my ($content) = @_;
309
310    my $json_fh = IO::File::new_tmpfile or die $!;
311
312    my $blocks = [{ t => 'Para', c => $content }];
313    my $data = $pandoc_toplevel_constructor->($blocks);
314    my $j = to_json($data) or die $!;
315    print $json_fh $j;
316    flush $json_fh or die $!;
317    seek $json_fh,0,0 or die $!;
318
319    my $c = open PD, "-|" // die $!;
320    if (!$c) {
321        open STDIN, "<&", $json_fh;
322        exec qw(pandoc -f json) or die $!;
323    }
324
325    local $/;
326    undef $/;
327    my $html = <PD>;
328    $?=$!=0;
329    if (!close PD) {
330        eval {
331            seek $json_fh,0,0 or die $!;
332            open STDIN, '<&', $json_fh or die $!;
333            system 'json_pp';
334        };
335        die "$j \n $? $!";
336    }
337
338    $html =~ s{^\<p\>}{} or die "$html ?";
339    $html =~ s{\</p\>$}{} or die "$html ?";
340    $html =~ s{\n$}{};
341    return $html;
342}
343
344sub reprocess_sectlist ($$);
345
346sub reprocess_sectnode ($$) {
347    my ($sectnode, $d) = @_;
348
349    $sectnode->{Depth} = $d;
350
351    if ($sectnode->{Status}) {
352        $maxdepth = $d if $d > $maxdepth;
353    }
354
355    if ($sectnode->{Headline}) {
356#            print STDERR Dumper($sectnode);
357        $sectnode->{Headline} =
358            pandoc2html_inline $sectnode->{Headline};
359    }
360
361    reprocess_sectlist $sectnode->{Children}, $d;
362}
363
364sub reprocess_sectlist ($$) {
365    my ($sectlist, $d) = @_;
366    $d++;
367
368    foreach my $sectnode (values %$sectlist) {
369        reprocess_sectnode $sectnode, $d;
370    }
371}
372
373sub count_rows_sectlist ($);
374
375sub count_rows_sectnode ($) {
376    my ($sectnode) = @_;
377    my $rows = 0;
378    $sectnode->{RealInSect}{OwnRows} //= 0;
379    if ($sectnode->{Status}) {
380        $rows++;
381        $sectnode->{RealInSect}{OwnRows}++;
382    }
383    $rows += count_rows_sectlist $sectnode->{Children};
384    $sectnode->{Rows} = $rows;
385    $sectnode->{RealInSect}{Rows} = $rows;
386    return $rows;
387}
388
389# Now we have
390#   $sectnode->{Rows}
391#   $sectnode->{RealInSect}{Rows}
392#   $sectnode->{RealInSect}{OwnRows}
393
394sub count_rows_sectlist ($) {
395    my ($sectlist) = @_;
396    my $rows = 0;
397    foreach my $sectnode (values %$sectlist) {
398        $rows += count_rows_sectnode $sectnode;
399    }
400    return $rows;
401}
402
403# After reprocess_sectlist,
404#    ->{Headline}   is in html
405#    ->{Status}     is (still) string or markdown content
406
407sub analyse_reprocess () {
408    $maxdepth = 0;
409    reprocess_sectlist $toplevel_sectlist, 0;
410}
411
412#---------- output ----------
413
414sub o { print @_ or die $!; }
415
416our @pending_headings;
417
418sub docref_a ($$) {
419    my ($i, $realinsect) = @_;
420    return sprintf '<a href="%s#%s">',
421        $version_urls[$i], $realinsect->{Anchor};
422}
423
424sub write_output_row ($) {
425    my ($sectnode) = @_;
426#    print STDERR 'WOR ', Dumper($d, $sectnode);
427    o('<tr>');
428    my $span = sub {
429        my ($rowcol, $n) = @_;
430        o(sprintf ' %sspan="%d"', $rowcol, $n) if $n != 1;
431    };
432    # This is all a bit tricky because (i) the input is hierarchical
433    # with variable depth, whereas the output has to have a fixed
434    # number of heading columns on the LHS; (ii) the HTML
435    # colspan/rowspan system means that when we are writing out, we
436    # have to not write table elements for table entries which have
437    # already been written with a span instruction that covers what we
438    # would write now.
439    while (my $heading = shift @pending_headings) {
440        o('<th valign="top"');
441        o(sprintf ' id="%s"', $heading->{Key});
442        $span->('row', $heading->{Rows});
443        $span->('col', $maxdepth - $heading->{Depth} + 1)
444            if !%{ $heading->{Children} };
445        o(' align="left">');
446        my $end_a = '';
447        my $desc_i = $heading->{RealInSect}{HasDescription};
448        if (defined $desc_i) {
449            o(docref_a $desc_i, $heading->{RealInSect});
450            $end_a= '</a>';
451        }
452        o($heading->{Headline});
453        o($end_a);
454        o('</th>');
455    }
456    if (%{ $sectnode->{Children} }) {
457        # we suppressed the colspan above, but we do need to make the gap
458        my $n = $maxdepth - $sectnode->{Depth};
459        die 'XX '. Dumper($n, $sectnode) if $n<0;
460        if ($n) {
461            o('<td');
462            $span->('col', $n);
463            o('></td>');
464        }
465    }
466    for (my $i=0; $i<@version_urls; $i++) {
467        my $st = $sectnode->{Status}[$i];
468
469        my $colspan = $sectnode->{RealInSect}{ColSpan}[$i];
470        my $nextcell = '';
471        if (!defined $colspan) { # first row of this RealInSect
472            $colspan= ' colspan="2"';
473            if ($sectnode->{RealSectNode}{HasCaveat}[$i] && $st
474                && $sectnode->{RealInSect}{Anchor}) {
475                my $rows = $sectnode->{RealInSect}{OwnRows};
476                $nextcell = '<td';
477                $nextcell .= sprintf ' rowspan=%d', $rows if $rows>1;
478                $nextcell .= '>';
479                $nextcell .= docref_a $i, $sectnode->{RealInSect};
480                $nextcell .= '[*]</a>';
481                $nextcell .= '</td>';
482                $colspan = '';
483            }
484            $sectnode->{RealInSect}{ColSpan}[$i] = $colspan;
485        }
486
487        $st //= '-';
488        o("\n<td$colspan>");
489        my $end_a = '';
490        if ($sectnode->{Key} eq 'release-support--xen-version') {
491            o(sprintf '<a href="%s">', $version_urls[$i]);
492            $end_a = '</a>';
493        }
494        if (ref $st) {
495            $st = pandoc2html_inline $st;
496        } else {
497            $st = escapeHTML($st);
498        }
499        o($st);
500        o($end_a);
501        o('</td>');
502        o($nextcell);
503    }
504    o("</tr>\n");
505}
506
507sub write_output_sectlist ($);
508sub write_output_sectlist ($) {
509    my ($sectlist) = @_;
510    foreach my $key (keys %$sectlist) {
511        my $sectnode = $sectlist->{$key};
512        push @pending_headings, $sectnode;
513        write_output_row $sectnode if $sectnode->{Status};
514        write_output_sectlist $sectnode->{Children};
515    }
516}
517
518sub write_output () {
519    o('<table rules="all">');
520    write_output_sectlist $toplevel_sectlist;
521    o('</table>');
522}
523
524#---------- main program ----------
525
526open DEBUG, '>', '/dev/null' or die $!;
527if (@ARGV && $ARGV[0] eq '-D') {
528    shift @ARGV;
529    open DEBUG, '>&2' or die $!;
530}
531
532die unless @ARGV;
533die if $ARGV[0] =~ m/^-/;
534die if @ARGV % 2;
535
536read_inputs();
537
538#use Data::Dumper;
539#print DEBUG Dumper($toplevel_sectlist);
540
541analyse_reprocess();
542# Now Headline is in HTML
543
544count_rows_sectlist($toplevel_sectlist);
545
546#use Data::Dumper;
547print DEBUG Dumper($toplevel_sectlist);
548
549write_output();
550