1#!/usr/bin/perl -w 2# 3# Written with reference to pandoc_markdown from Debian jessie 4# We require atx-style headers 5# 6# usage: 7# pandoc -t json SUPPORT.md >j-unstable 8# git cat-file blob origin/staging-4.11:SUPPORT.md | pandoc -t json >j-4.11 9# docs/parse-support-md \ 10# j-unstable https://xenbits/unstable/SUPPORT.html 11# j-4.11 https://xenbits/4.11/SUPPORT.html 12# or equivalent 13 14use strict; 15use JSON; 16use Tie::IxHash; 17use IO::File; 18use CGI qw(escapeHTML); 19use Data::Dumper; 20use POSIX; 21 22#---------- accumulating input/output ---------- 23 24# This combines information from all of the input files. 25 26sub new_sectlist () { { } }; 27our $toplevel_sectlist = new_sectlist(); 28# an $sectlist is 29# { } nothing seen yet 30# a tied hashref something seen 31# (tied $sectlist) is an object of type Tie::IxHash 32# $sectlist->{KEY} a $sectnode: 33# $sectlist->{KEY}{Status}[VI] = absent or string or markdown content 34# $sectlist->{KEY}{Children} = a further $sectlist 35# $sectlist->{KEY}{Key} = KEY 36# $sectlist->{KEY}{RealSectNode} = us, or our parent 37# $sectlist->{KEY}{RealSectNode}{HasCaveat}[VI] = trueish iff other in a Para 38# $sectlist->{KEY}{RealInSect} = containing real section in @insections, so 39# $sectlist->{KEY}{RealInSect}{HasDescription} = VI for some Emph in Para 40# $sectlist->{KEY}{RealInSect}{Anchor} = value for < id="" > in the pandoc html 41# A $sectnode represents a single section from the original markdown 42# document. Its subsections are in Children. 43# 44# Also, the input syntax: 45# Status, something or other: Supported 46# is treated as a $sectnode, is as if it were a subsection - 47# one called `something or other'. That is not a `real' section. 48# 49# KEY is the Anchor, or derived from the `something or other'. 50# It is used to match up identical features in different versions. 51 52#---------- state for this input file ---------- 53 54our $version_index; 55our @version_urls; 56 57our @insections; 58# $insections[]{Key} = string 59# $insections[]{Headline} = markdown content 60# these next are only defined for real sections, not Status elements 61# $insections[]{Anchor} = string 62# $insections[]{HasDescription} VI, likewise 63 64our $had_unknown; 65our $had_feature; 66# adding new variable ? it must be reset in r_toplevel 67 68#---------- parsing ---------- 69 70sub find_current_sectnode () { 71 die unless @insections; 72 73 my $sectnode; 74 my $realinsect; 75 my $realsectnode; 76 foreach my $s (@insections) { 77 my $sectlist = $sectnode 78 ? $sectnode->{Children} : $toplevel_sectlist; 79 my $key = $s->{Key}; 80 $realinsect = $s if $s->{Anchor}; 81 tie %$sectlist, 'Tie::IxHash' unless tied %$sectlist; 82#print STDERR "FIND_CURRENT_SECTNODE ", Dumper($s); 83 $sectlist->{$key} //= 84 { 85 Children => new_sectlist(), 86 Headline => $s->{Headline}, 87 Key => $key, 88 RealInSect => $realinsect, 89 HasCaveat => [], 90 }; 91 $sectnode = $sectlist->{$key}; 92 $realsectnode = $sectnode if $s->{Anchor}; 93 $sectnode->{RealSectNode} = $realsectnode; 94 } 95 die unless $sectnode; 96 return $sectnode; 97} 98 99sub ri_Header { 100 my ($c) = @_; 101 my ($level, $infos, $hl) = @$c; 102#print STDERR 'RI_HEADER ', Dumper($c, \@c); 103 my ($id) = @$infos; 104 die unless $level >= 1; 105 die unless $level-2 <= $#insections; 106 $#insections = $level-2; 107 push @insections, 108 { 109 Key => $id, 110 Anchor => $id, 111 Headline => $hl, 112 HasDescription => undef, 113 }; 114#print STDERR Dumper(\@insections); 115 $had_feature = 0; 116} 117 118sub ri_Para { 119 return unless @insections; 120 my $insection = $insections[$#insections]; 121# print DEBUG "ri_Para ", 122# Dumper($version_index, $had_feature, $insection); 123 124 if ($had_feature) { 125 my $sectnode = find_current_sectnode(); 126 $sectnode->{RealSectNode}{HasCaveat}[$version_index] = 1; 127 } else { 128 $insection->{HasDescription} //= $version_index; 129 } 130}; 131 132sub parse_feature_entry ($) { 133 my ($value) = @_; 134 135 $had_feature = 1; 136 my $sectnode = find_current_sectnode(); 137 $sectnode->{Status}[$version_index] = $value; 138} 139 140sub descr2key ($) { 141 my ($descr) = @_; 142 143 die unless @insections; 144 my $insection = $insections[$#insections]; 145 146 my $key = lc $descr; 147 $key =~ y/ /-/; 148 $key =~ y/-0-9A-Za-z//cd; 149 $key = $insection->{Anchor}.'--'.$key; 150 return $key; 151} 152 153sub ri_CodeBlock { 154 my ($c) = @_; 155 my ($infos, $text) = @$c; 156 157 if ($text =~ m{^(?: Functional\ completeness 158 | Functional\ stability 159 | Interface\ stability 160 | Security\ supported ) \:}x) { 161 # ignore this 162 return; 163 } 164 die "$had_unknown / $text ?" if $had_unknown; 165 166 my $toplevel = $text =~ m{^Xen-Version:}; 167 168 foreach my $l (split /\n/, $text) { 169 $l =~ s/\s*$//; 170 next unless $l =~ m/\S/; 171 172 my ($descr, $value) = 173 $toplevel 174 ? $l =~ m{^([A-Z][-A-Z0-9a-z]+)\:\s+(\S.*)$} 175 : $l =~ m{^(?:Status|Supported)(?:\,\s*([^:]+))?\:\s+(\S.*)$} 176 or die ("$text\n^ cannot parse status codeblock line:". 177 ($toplevel and 'top'). 178 "\n$l\n ?"); 179 180 if (length $descr) { 181 push @insections, 182 { 183 Key => descr2key($descr), 184 Headline => [{ t => 'Str', c => $descr }], 185 }; 186 } 187 parse_feature_entry $value; 188 if (length $descr) { 189 pop @insections; 190 } 191 } 192} 193 194sub ri_DefinitionList { 195 my ($c) = @_; 196 foreach my $defent (@$c) { 197 my ($term, $defns) = @$defent; 198 my $descr = 199 join ' ', 200 map { $_->{c} } 201 grep { $_->{t} eq 'Str' } 202 @$term; 203 push @insections, 204 { 205 Key => descr2key($descr), 206 Headline => $term, 207 }; 208 die "multiple definitions in definition list definition" 209 if @$defns > 1; 210 my $defn = $defns->[0]; 211 die "multiple paras in definition list definition" 212 if @$defn > 1; 213 my $defnp = $defn->[0]; 214 die "only understand plain definition not $defnp->{t} ?" 215 unless $defnp->{t} eq 'Plain'; 216 parse_feature_entry $defnp->{c}; 217 pop @insections; 218 } 219} 220 221sub process_unknown { 222 my ($c, $e) = @_; 223 $had_unknown = Dumper($e); 224} 225 226sub r_content ($) { 227 my ($i) = @_; 228 foreach my $e (@$i) { 229 my $f = ${*::}{"ri_$e->{t}"}; 230 $f //= \&process_unknown; 231 $f->($e->{c}, $e); 232 } 233} 234 235our $pandoc_toplevel_constructor; 236 237sub r_toplevel ($) { 238 my ($i) = @_; 239 240 die unless defined $version_index; 241 242 @insections = (); 243 $had_unknown = undef; 244 $had_feature = undef; 245 246 # Pandoc's JSON output changed some time between 1.17.2 (stretch) 247 # and 2.2.1 (buster). I can find no documentation about this 248 # change or about the compatibility rules. (It seems that 249 # processing the parse tree *is* supported upstream: they offer 250 # many libraries to do this inside the pandoc process.) 251 # Empirically, what has changed is just the top level structure. 252 # Also pandoc wants the same structure back that it spat out, 253 # when we ask it to format snippets. 254 255 my $blocks; 256 if (ref $i eq 'ARRAY') { 257 $pandoc_toplevel_constructor = sub { 258 my ($blocks) = @_; 259 return [ 260 { unMeta => { } }, 261 $blocks, 262 ]; 263 }; 264 foreach my $e (@$i) { 265 next unless ref $e eq 'ARRAY'; 266 r_content $e; 267 } 268 } elsif (ref $i eq 'HASH') { 269 my $api_version = $i->{'pandoc-api-version'}; 270 $pandoc_toplevel_constructor = sub { 271 my ($blocks) = @_; 272 return { 273 blocks => $blocks, 274 meta => { }, 275 'pandoc-api-version' => $api_version, 276 }; 277 }; 278 r_content $i->{blocks}; 279 } else { 280 die; 281 } 282} 283 284sub read_inputs () { 285 $version_index = 0; 286 287 local $/; 288 undef $/; 289 290 while (my $f = shift @ARGV) { 291 push @version_urls, shift @ARGV; 292 eval { 293 open F, '<', $f or die $!; 294 my $input_toplevel = decode_json <F>; 295 r_toplevel $input_toplevel; 296 }; 297 die "$@\nwhile processing input file $f\n" if $@; 298 $version_index++; 299 } 300} 301 302#---------- reprocessing ---------- 303 304# variables generated by analyse_reprocess: 305our $maxdepth; 306 307sub pandoc2html_inline ($) { 308 my ($content) = @_; 309 310 my $json_fh = IO::File::new_tmpfile or die $!; 311 312 my $blocks = [{ t => 'Para', c => $content }]; 313 my $data = $pandoc_toplevel_constructor->($blocks); 314 my $j = to_json($data) or die $!; 315 print $json_fh $j; 316 flush $json_fh or die $!; 317 seek $json_fh,0,0 or die $!; 318 319 my $c = open PD, "-|" // die $!; 320 if (!$c) { 321 open STDIN, "<&", $json_fh; 322 exec qw(pandoc -f json) or die $!; 323 } 324 325 local $/; 326 undef $/; 327 my $html = <PD>; 328 $?=$!=0; 329 if (!close PD) { 330 eval { 331 seek $json_fh,0,0 or die $!; 332 open STDIN, '<&', $json_fh or die $!; 333 system 'json_pp'; 334 }; 335 die "$j \n $? $!"; 336 } 337 338 $html =~ s{^\<p\>}{} or die "$html ?"; 339 $html =~ s{\</p\>$}{} or die "$html ?"; 340 $html =~ s{\n$}{}; 341 return $html; 342} 343 344sub reprocess_sectlist ($$); 345 346sub reprocess_sectnode ($$) { 347 my ($sectnode, $d) = @_; 348 349 $sectnode->{Depth} = $d; 350 351 if ($sectnode->{Status}) { 352 $maxdepth = $d if $d > $maxdepth; 353 } 354 355 if ($sectnode->{Headline}) { 356# print STDERR Dumper($sectnode); 357 $sectnode->{Headline} = 358 pandoc2html_inline $sectnode->{Headline}; 359 } 360 361 reprocess_sectlist $sectnode->{Children}, $d; 362} 363 364sub reprocess_sectlist ($$) { 365 my ($sectlist, $d) = @_; 366 $d++; 367 368 foreach my $sectnode (values %$sectlist) { 369 reprocess_sectnode $sectnode, $d; 370 } 371} 372 373sub count_rows_sectlist ($); 374 375sub count_rows_sectnode ($) { 376 my ($sectnode) = @_; 377 my $rows = 0; 378 $sectnode->{RealInSect}{OwnRows} //= 0; 379 if ($sectnode->{Status}) { 380 $rows++; 381 $sectnode->{RealInSect}{OwnRows}++; 382 } 383 $rows += count_rows_sectlist $sectnode->{Children}; 384 $sectnode->{Rows} = $rows; 385 $sectnode->{RealInSect}{Rows} = $rows; 386 return $rows; 387} 388 389# Now we have 390# $sectnode->{Rows} 391# $sectnode->{RealInSect}{Rows} 392# $sectnode->{RealInSect}{OwnRows} 393 394sub count_rows_sectlist ($) { 395 my ($sectlist) = @_; 396 my $rows = 0; 397 foreach my $sectnode (values %$sectlist) { 398 $rows += count_rows_sectnode $sectnode; 399 } 400 return $rows; 401} 402 403# After reprocess_sectlist, 404# ->{Headline} is in html 405# ->{Status} is (still) string or markdown content 406 407sub analyse_reprocess () { 408 $maxdepth = 0; 409 reprocess_sectlist $toplevel_sectlist, 0; 410} 411 412#---------- output ---------- 413 414sub o { print @_ or die $!; } 415 416our @pending_headings; 417 418sub docref_a ($$) { 419 my ($i, $realinsect) = @_; 420 return sprintf '<a href="%s#%s">', 421 $version_urls[$i], $realinsect->{Anchor}; 422} 423 424sub write_output_row ($) { 425 my ($sectnode) = @_; 426# print STDERR 'WOR ', Dumper($d, $sectnode); 427 o('<tr>'); 428 my $span = sub { 429 my ($rowcol, $n) = @_; 430 o(sprintf ' %sspan="%d"', $rowcol, $n) if $n != 1; 431 }; 432 # This is all a bit tricky because (i) the input is hierarchical 433 # with variable depth, whereas the output has to have a fixed 434 # number of heading columns on the LHS; (ii) the HTML 435 # colspan/rowspan system means that when we are writing out, we 436 # have to not write table elements for table entries which have 437 # already been written with a span instruction that covers what we 438 # would write now. 439 while (my $heading = shift @pending_headings) { 440 o('<th valign="top"'); 441 o(sprintf ' id="%s"', $heading->{Key}); 442 $span->('row', $heading->{Rows}); 443 $span->('col', $maxdepth - $heading->{Depth} + 1) 444 if !%{ $heading->{Children} }; 445 o(' align="left">'); 446 my $end_a = ''; 447 my $desc_i = $heading->{RealInSect}{HasDescription}; 448 if (defined $desc_i) { 449 o(docref_a $desc_i, $heading->{RealInSect}); 450 $end_a= '</a>'; 451 } 452 o($heading->{Headline}); 453 o($end_a); 454 o('</th>'); 455 } 456 if (%{ $sectnode->{Children} }) { 457 # we suppressed the colspan above, but we do need to make the gap 458 my $n = $maxdepth - $sectnode->{Depth}; 459 die 'XX '. Dumper($n, $sectnode) if $n<0; 460 if ($n) { 461 o('<td'); 462 $span->('col', $n); 463 o('></td>'); 464 } 465 } 466 for (my $i=0; $i<@version_urls; $i++) { 467 my $st = $sectnode->{Status}[$i]; 468 469 my $colspan = $sectnode->{RealInSect}{ColSpan}[$i]; 470 my $nextcell = ''; 471 if (!defined $colspan) { # first row of this RealInSect 472 $colspan= ' colspan="2"'; 473 if ($sectnode->{RealSectNode}{HasCaveat}[$i] && $st 474 && $sectnode->{RealInSect}{Anchor}) { 475 my $rows = $sectnode->{RealInSect}{OwnRows}; 476 $nextcell = '<td'; 477 $nextcell .= sprintf ' rowspan=%d', $rows if $rows>1; 478 $nextcell .= '>'; 479 $nextcell .= docref_a $i, $sectnode->{RealInSect}; 480 $nextcell .= '[*]</a>'; 481 $nextcell .= '</td>'; 482 $colspan = ''; 483 } 484 $sectnode->{RealInSect}{ColSpan}[$i] = $colspan; 485 } 486 487 $st //= '-'; 488 o("\n<td$colspan>"); 489 my $end_a = ''; 490 if ($sectnode->{Key} eq 'release-support--xen-version') { 491 o(sprintf '<a href="%s">', $version_urls[$i]); 492 $end_a = '</a>'; 493 } 494 if (ref $st) { 495 $st = pandoc2html_inline $st; 496 } else { 497 $st = escapeHTML($st); 498 } 499 o($st); 500 o($end_a); 501 o('</td>'); 502 o($nextcell); 503 } 504 o("</tr>\n"); 505} 506 507sub write_output_sectlist ($); 508sub write_output_sectlist ($) { 509 my ($sectlist) = @_; 510 foreach my $key (keys %$sectlist) { 511 my $sectnode = $sectlist->{$key}; 512 push @pending_headings, $sectnode; 513 write_output_row $sectnode if $sectnode->{Status}; 514 write_output_sectlist $sectnode->{Children}; 515 } 516} 517 518sub write_output () { 519 o('<table rules="all">'); 520 write_output_sectlist $toplevel_sectlist; 521 o('</table>'); 522} 523 524#---------- main program ---------- 525 526open DEBUG, '>', '/dev/null' or die $!; 527if (@ARGV && $ARGV[0] eq '-D') { 528 shift @ARGV; 529 open DEBUG, '>&2' or die $!; 530} 531 532die unless @ARGV; 533die if $ARGV[0] =~ m/^-/; 534die if @ARGV % 2; 535 536read_inputs(); 537 538#use Data::Dumper; 539#print DEBUG Dumper($toplevel_sectlist); 540 541analyse_reprocess(); 542# Now Headline is in HTML 543 544count_rows_sectlist($toplevel_sectlist); 545 546#use Data::Dumper; 547print DEBUG Dumper($toplevel_sectlist); 548 549write_output(); 550