#!/usr/local/bin/perl --
eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
  if $running_under_some_shell;
#
# NAME:
#	cmt2doc - extract documentation from source
#
# SYNOPSIS:
#	cmt2doc [-pamit][-e "oext"][-S "secn"][-D "secd"][-O "org"]
#		[-L "lang"][-C "cmt"][-E "ecmt"] "file"
#
# DESCRIPTION:
#	This 'Perl' script extracts documentation from comments	in
#	source files.  It allows manual pages to be written in ``plain
#	text'' in source files where they are most likely to be updated
#	when the source code is.
#
#	'cmt2doc' extracts the documentation as either clean text or as
#	input suitable for 'troff'(1) and friends.   The results in
#	either case are usually quite adequate.  Try the following
#	commands:
#.nf
#
#		'perl cmt2doc.pl -p cmt2doc.pl | more'
#		'perl cmt2doc.pl -pm cmt2doc.pl | nroff -man | more'
#.fi
#
#	'cmt2doc' can usually work out for itself how to extract the
#	text from a comment.  It looks for the regular expression 
#	'.* NAME:$' which it treats as the start of a manual page, and
#	uses what ever is found before 'NAME' as the characters to
#	remove from the start of each line. 
#
#	Typographical conventions:
#.nf
#		Words like \'this word\' will be type-set in 'bold'.
#		Words like \"this word\" will be set in "italics".
#		Words like ``this quote'' will not be touched.
#.fi
#
#	It is possible to put 'troff' commands at the start of an
#	otherwise blank line.  Indeed they are sometimes needed such as
#	when setting out examples.  They will be stripped if not
#	generating 'troff' output.  
#
#	'cmt2doc' understands the format required for most manual page
#	sections and attempts to set them appropriately.
#
# OPTIONS:
#	-p	print to stdout.  By default documentation for
#		"file" will be printed to a file in the current
#		directory of the same name but with an extention
#		that represents the format (.doc,.man,.tex). 
#
#	-a	print all documentation, not just the top level.
#
#	-m	Output for 'troff -man'.
#
#	-i	Output for texinfo (no yet implemented).
#
#	-t	``Plain text'' strip single quotes.  Leave double quotes
#		alone though.
#
#	-e "oext"
#		Use "oext" as the extension for the output file.
#
#	-k	Keep the file extension, thus we get cmt2doc.pl.8 rather
#		than cmt2doc.8, this does not apply to source files
#		like .c, and .cc etc.
#
#	-S "secn"
#		Tell 'troff' which section the man page belongs in.
#		Default is 'L'.
#
#	-L "lang"
#		Select default values for "cmt" and "ecmt" based on
#		"lang" ('c','c++','lisp').  Most shell like languages
#		such as 'perl' and 'sh' are easily handled by the defaults.
#
#	-D "secd"
#		Use "secd" as the section description.
#
#	-C "cmt"
#		Assume the comment lines start with "cmt".  Otherwise
#		we attempt to work it out either based on the file
#		extention (.c,.h,.cc etc) or from the comment itself.
#
#	-E "ecmt"
#		The comment ends when we see "ecmt" otherwise the first
#		line that does not start with "cmt".
#
#	-O "org"
#		Use "org" as the organization identifier (printed bottom
#		left of each page).
#
#	Some options only apply to certain output modes.
#
# FILES:
#	/usr/local/bin/perl		The perl interpreter.  This entry
#				is really just to show how 'cmt2doc'
#				handles the 'FILES' section.
#	/local/bin/cmt2doc.pl	This script. "ditto".
#
# BUGS:
#	It probably does not handle nested quotes correctly.
#	Lines starting with a \'.\' are in trouble.
#	For good results it is hard to avoid using 'troff' commands,
#	particularly '.nf' and '.fi'.
#	
#	Handling of '.TH' seems to vary with different man macro sets.
#	You may have to hack 'man_init' to get good results.
#
#	The regexp for recognizing lists in the DESCRIPTION section is
#	rather complex and while it handles this and many other man
#	pages correctly may not always get it right.  If you end up
#	with spurious .TP commands look in man_para.
#
# AUTHOR:
#	'cmt2doc' was written by Simon J. Gerraty <sjg@zen.void.oz.au>
#

#
# RCSid:
#	$Id: cmt2doc.pl,v 1.21 1998/06/27 14:05:27 sjg Exp $
#
#	@(#)Copyright (c) 1992, Simon J. Gerraty
#
#	This file is provided in the hope that it will
#	be of use.  There is absolutely NO WARRANTY.
#	Permission to copy, redistribute or otherwise
#	use this file is hereby granted provided that 
#	the above copyright notice and this notice are
#	left intact. 
#      
#	Please send copies of changes and bug-fixes to:
#	sjg@zen.void.oz.au
#

$Myname=$0;
$Myname=~ s#^.*/([^/]*)$#$1#;

# some defaults
$do_init='txt_init';
$do_fini='noop';
$do_sec='txt_sec';
$do_para='noop';
$do_line='txt_line';

$man_secn='L';			# local commands
$oext='.doc';
$Debug = 0;
$start_para='';
$indent=0;
$defPD='.8v';

$date=&get_date;
$org='FreeWare';		# be sure to set this!

@AlwaysStrip = ('.c','.cc','.h','.y','.l','.el','.pc');

require 'getopts.pl';
&Getopts('A:dpamitke:L:S:C:E:D:O:');

push(@AlwaysStrip, (split(/[\s,]/, $opt_A))) if ($opt_A ne '');

$org=$opt_O if defined($opt_O);
$cmt=$opt_C if defined($opt_C);
$ecmt=$opt_E if defined($opt_E);
# redefine the necessary functions
if (defined($opt_m)) {	# [tn]roff -man
  if ($opt_S ne '') {
    $oext = ".$opt_S";
  } else {
    $oext = '.man';
  }
  $do_init='man_init';
  $do_para='man_para';
  $do_sec='man_sec';
  $do_line='man_line';
} elsif (defined($opt_i)) {	# texinfo
  $oext = '.tex';
  $do_init='texi_init';
  $do_fini='texi_fini';
  $do_sec='texi_sec';
  $do_line='texi_line';
}
$man_secn=$opt_S if defined($opt_S);
if (defined($opt_D)) {
  $man_secd=$opt_D;
} else {
  $man_secd=&lookup_mansec($man_secn);
}
$oext=$opt_e if defined($opt_e);
$Debug = 1 if defined($opt_d);
$Lang=$opt_L if defined($opt_L);


$indoc=0;
$in_para = 0;

foreach $k (@AlwaysStrip) {
  $AlwaysStrip{$k} = 1;
}

FILE: foreach $file (@ARGV) {
  print STDERR "doing $file\n" if $Debug > 0;
  $name="./$file";
  $name=~s#^.*/([^/]*)$#$1#;
  $ext=$name;
  $ext=~s/.*(\.[^.]*)$/\1/;

  if (!defined($opt_L)) {
    $Lang='c' if ($ext =~ m/\.[ch]$/);
    $Lang='c++' if ($ext =~ m/\.(cc|C|H)$/);
    $Lang='lisp' if ($ext =~ m/\.el$/);
  }
  if (defined($Lang)) {
    if (!defined($cmt)) {
      if ($Lang eq 'c') {
	$cmt = '[/ ]\*';
      } elsif ($Lang eq 'c++') {
	$cmt = '(//|[/ ]\*)';
      } elsif ($Lang eq 'lisp') {
	$cmt = ';+';
      }
    }
    if (!defined($ecmt)) {
      if ($Lang eq 'c' || $Lang eq 'c++') {
	$ecmt = ' *\*/';
      }
    }
  }
  if (!defined($opt_p)) {
    $ofile = $name;		# we've already stripped dirname
    $ofile =~ s#\.[^/.]+$## if ($opt_k eq '' || $AlwaysStrip{$ext} ne '');
    $ofile .= $oext;
    print STDERR "Output to $ofile\n" if $Debug > 0;
    open(STDOUT, "> $ofile") || die "can't redirect STDOUT: $!\n";
  }
  if (!open(F, "< $file")) {
    print STDERR "can't open $file: $!\n";
    next FILE;
  }
  LINE: while (<F>) {
    chop;

    if ($indoc == 0 && m/ *NAME:$/) {
      if (!defined($cmt)) {
	$cmt = $_;
	$cmt =~ s/^(.*) NAME.*/\1/;
	$cmt='' if ($_ eq 'NAME:');
      }
      $indoc = 1;
      $in_para = 0;
      &$do_init;
    }
    next if ($indoc == 0);
    # we are inside doc section
    if ($_ !~ m@^$cmt@ || (defined($ecmt) && $_ =~ m@$ecmt@)) {
      $indoc = 0;
      &$do_fini;
      if (defined($opt_a)) {
	next LINE;
      } else {
	next FILE;
      }
    }
    s@^$cmt ?@@;
    $needout = 1;
    if (m/^[A-Z][A-Za-z _-]+:$/) {
      &$do_sec;
    } elsif (m/^[ \t]*$/) {
      $in_para = 0;
      if (defined($opt_m)) {
	$needout = 0;
      }
    } else {
      if ($in_para == 0) {
	$in_para = 1;
	&$do_para;
      }
      &$do_line;
    }
    print "$_\n" if ($needout > 0);
  }
  close F;
}
exit 0;

# for plain text these are noops
sub noop {
}

sub txt_init {
  local($i,$c);
  $llength = 65;
  $c = 0;
  
  $nm=$name;
  $nm=~s/\.[^.]*$//;
  $nm =~ tr/[a-z]/[A-Z]/;
  $nm = "$nm($man_secn)";
  print "\n$nm";
  $c += length($nm);
  $i = int(($llength - length($man_secd))/ 2);
  while ($c < $i) {
    $c++;
    print " ";
  }
  print "$man_secd";
  $c += length($man_secd);
  $i = $llength - length($nm);
  while ($c < $i) {
    $c++;
    print " ";
  }
  print "$nm\n\n\n";
}

sub txt_sec {
  # just loose the trailing ':'
  $sec = $_;
  $sec =~ s/ *([A-Z][A-Za-z _-]*):/\1/;
  $_ = $sec;
  $in_para = 0;
}

sub txt_line {
  $needout = 0 if (m/^\.\w+/);	# strip nroff commands
  if (defined($opt_t)) {
    # strip 'word' to just word.
    # a bit of trickery to avoid ``quotes'' and \'word\'.
    s/^'([^']+)'/\1/g;	# 'bold'
    s/([^'\\])'([^']+)'/\1\2/g;	# 'bold'
  }
  s/\\[ &~]//g;			# strip troff only gunk
  s/\\([.'"\\])/\1/g;		# strip \\ \. \' and \" to \ . ' and "
}


sub man_init {
  print ".\\\" extracted from $file by $Myname\n";
  
  $nm=$name;
  $nm=~s/\.[^.]*$//;
  $nm =~ tr/[a-z]/[A-Z]/;
  $mtime = (stat($file))[9];
  $mdate = &get_date($mtime);
  # some tmac.an macros don't support $org HP-UX for example.
  # But most do.  Just comment out setting of $org above.
  if (defined($org)) {
    print ".TH $nm $man_secn \"$mdate\" \"$org\" \"$man_secd\"\n";
  } else {
    print ".TH $nm $man_secn \"$mdate\" \"$man_secd\"\n";
  }
  # just to be sure
  print ".PD $defPD\n";
}

sub man_sec {
  &man_indent(0);		# make sure indentation is back to 0

  if ($start_para eq '.nf') {
    print ".fi\n";
    $last_troff = '.fi';
  }
  if ($sec eq 'FILES') {
    # previous section was FILES
    # restore inter-paragraph distance
    print ".PD $defPD\n";
  }
  # get new section name.
  $sec = $_;
  $sec =~ s/ *([A-Z][A-Za-z _-]*):/\1/;

  if ($sec ne 'NAME') {
    print "\n";
  }
  if ($sec =~ m/ /) {
    print ".SH \"$sec\"\n";
  } else {
    print ".SH $sec\n";
  }
  if ($sec eq 'FILES') {
    # little or no gap between paragraphs.
    # so it looks like it should.
    print ".PD .1v\n";
  }
  $needout = 0;
  $in_para = 0;
}

# this gets a little messy
sub man_para {
  if (m/^\.\w+/) {
    # a [tn]roff command, next line is start of para
    $in_para=0;
    $last_troff=$_;
    $last_troff=~ s/^(\.\w+).*/\1/;
    return;
  }
  if ($last_troff eq '.nf') {
    $start_para = '';
  } else {
    if ($sec =~ m/DESCRIPTION|OPTIONS/ &&
	# this monster regexp, matches lines like:
	# 	-v	verbose
	#		-p "printer"
	#		-n "num"	Number
	#		VARIABLE	Description
	#		LONG_VARIABLE_NAME
	# etc.
	# I.e. an initial word or word space word followed by a TAB or
	# end of line.  It _usually_ means we are doing a list...
	# It will also match many troff commands, but these are caught
	#	earlier.
	#	We use [^ \t] etc rather than \w, because we do want various
	#	quotes in the ``word''s.
	m/^\s*[^ \t]+( [^ \t,.;:]+)?(\t.*)?[^ \t,.;:]*$/
	) {
      $start_para = '.TP' unless($start_para =~ m/^.TP/);
    } elsif ($sec eq 'FILES') {
      $start_para = '.TP 30';
#   } elsif ($sec =~ m/NAME|SYNOPSIS/) {
    } elsif ($sec eq 'SYNOPSIS') {
      $start_para = '.nf';
    } elsif ($start_para =~ m/\.TP/) {
      $start_para = '.PP';
    } else {
      $start_para = '';
    }
  }
  if ($start_para ne '') {
    $last_troff = $start_para;
  }
  print "$start_para\n" if ($needout > 0);
  if (!defined($TextOffset) && $sec eq 'NAME') {
    # identifies base offset to strip fom each line
    $TextOffset = $_;
    $TextOffset =~ s/^([ \t]+)[^ \t].*/\1/;
  }
  # handle indented paras
  if ($start_para !~ m/\.TP/ && m/^\t/) {
    &man_indent(-1);
  }
}


# we have to do more that we would like here, to
# set 'bold' and "italics" but not to harm \'words\'
# \"words\" and ``quotes''.
sub man_line {
  # man_para will have been called once already
  # so first time in after a new para, $in_para==1.
  # in here we can set it to other values to indicate
  # a need to force a new para, or adjust indentation.
  if ($in_para == 3) {
    &man_indent(-1);
    $in_para=1;
  }
  if (m/^\.\w+/) {
    # a [tn]roff command
    $last_troff=$_;
    $last_troff=~ s/^(\.\w+).*/\1/;
    $in_para=3;
    if (m/^.TP/) {
      $start_para=$_;		# good idea?
    }
    return;
  }
  s/([^\\].)(\.[.\w])/\1\\&\2/g;	# protect .foo in body of text from troff
  s/^$TextOffset// if defined($TextOffset);
  if ($start_para =~ m/^.TP/ &&
      m/^\s*[^ \t]+( [^ \t,.;:]+)?(\t.*)?[^ \t,.;:]*$/) {
      if ($in_para == 2) {
        &man_para;
      }
      $in_para = 2;
      s/^[ \t]*//;
      # format options correctly
      s/^([^'" \t]+)/'\1'/ if ($sec =~ m/DESCRIPTION|OPTIONS/ && m/^-/);
      s/\t+([^ \t])/\n\1/;
      s/\t/ /g;
  }
  s/^[ \t]*// if ($last_troff ne '.nf');
  s/\t/ /g if ($last_troff ne '.nf'); 
  if ($sec eq 'SYNOPSIS') {
    s/^(\w\S+)/'\1'/;
    s/(-\w+)/'\1'/g if (m/\[/);
  }
  s/([ '"])-/\1\\-/g;
  s/^"([^"]+)"/\\fI\1\\fR/g;	# "italic"
  # avoid \"word\"
  s/([^\\])"([^"]*[^\\])"/\1\\fI\2\\fR/g;	# "italic"
  # a bit of trickery to avoid ``quotes'' and \'word\'.
  s/^'([^']+)'/\\fB\1\\fR/g;	# 'bold'
  s/([^'\\])'([^']+)'/\1\\fB\2\\fR/g;	# 'bold'
  # now make \['"] into just ' or "
  s/\\(['"])/\1/g;
}

# adjust the indent level
sub man_indent {
  local($i) = @_;
  local($itabs,@tabs);

  return  if ($last_troff eq '.nf');
  
  if ($i < 0) {
    # calculate required indent level
    $itabs=$_;
    $itabs =~ s/^(\t+)[^\t].*/\1/;

    if (defined($TextOffset)) {
      $itabs=~ s/^$TextOffset//;
    }
    @tabs=split(/\t/,$itabs, 10);
    $i = $#tabs; # - 1;
    if (!defined($TextOffset)) {
      $i--;
    }
  }
  if ($i >= 0) {
    if ($i != $indent) {
      $last_troff='';
    }
    while ($indent < $i) {
      $indent++;
      print ".RS\n";
    }
    while ($indent > $i) {
      $indent--;
      print ".RE\n";
    }
  }
}


sub lookup_mansec {
  local($n) = @_;
  local($d);
  %s = ( 'default', 'MISC. REFERENCE MANUAL PAGES',
	'1', 'USER COMMANDS',
	'1C', 'USER COMMANDS',
	'1G', 'USER COMMANDS',
	'1S', 'USER COMMANDS',
	'1V', 'USER COMMANDS',
	'2', 'SYSTEM CALLS',
	'2V', 'SYSTEM CALLS',
	'3', 'C LIBRARY FUNCTIONS',
	'3C', 'COMPATIBILITY FUNCTIONS',
	'3F', 'FORTRAN LIBRARY ROUTINES',
	'3K', 'KERNEL VM LIBRARY FUNCTIONS',
	'3L', 'LIGHTWEIGHT PROCESSES LIBRARY',
	'3M', 'MATHEMATICAL LIBRARY',
	'3N', 'NETWORK FUNCTIONS',
	'3R', 'RPC SERVICES LIBRARY',
	'3S', 'STANDARD I/O FUNCTIONS',
	'3V', 'C LIBRARY FUNCTIONS',
	'3X', 'MISCELLANEOUS LIBRARY FUNCTIONS',
	'4', 'DEVICES AND NETWORK INTERFACES',
	'4F', 'PROTOCOL FAMILIES',
	'4I', 'DEVICES AND NETWORK INTERFACES',
	'4M', 'DEVICES AND NETWORK INTERFACES',
	'4N', 'DEVICES AND NETWORK INTERFACES',
	'4P', 'PROTOCOLS',
	'4S', 'DEVICES AND NETWORK INTERFACES',
	'4V', 'DEVICES AND NETWORK INTERFACES',
	'5', 'FILE FORMATS',
	'5V', 'FILE FORMATS',
	'6', 'GAMES AND DEMOS',
	'7', 'ENVIRONMENTS, TABLES, AND TROFF MACROS',
	'7V', 'ENVIRONMENTS, TABLES, AND TROFF MACROS',
	'8', 'MAINTENANCE COMMANDS',
	'8C', 'MAINTENANCE COMMANDS',
	'8S', 'MAINTENANCE COMMANDS',
	'8V', 'MAINTENANCE COMMANDS',
	'1L', 'LOCAL COMMANDS',
	'L', 'LOCAL COMMANDS'
	) if (!defined(%s));

  $d = $s{$n};
  if (!defined($d)) {
    $d = $s{'default'};
  }
  $d;
}

sub get_date {
  local($secs) = @_;
  $secs = time if ($secs eq '');
  
  @months = ('January','February','March','April','May',
	     'June','July','August','September','October',
	     'November','December');
  ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$idst) =
    localtime($secs);
  if ($year < 70) {
    $cent='20';
  } else {
    $cent = '19';
  }
  $month = $months[$mon];
  "$mday $month $cent$year";
}
