Jorj's home page: Technophilia

PIC microcontrollers

Ever since I opened up my Apple //e circa 1984 and figured out just how simplistic the circuitry was, I've been fascinated by microprocessors. Once I taught myself 6502 assembly (around 1986), I became obsessed with being able to tinker with computers at a low level.

It shouldn't come as any surprise, then, that I find microcontrollers absolutely fascinating.

Around 1997, a couple of friends and I bought some equipment to program some PIC 17C43s (to crack RC5 keys, actually). I wrote some programs to drive a stepper motor using 16C84s, ran out of memory on the '43s and started using 17C44s, wrote infrared receivers on the '84s, and eventually created what I consider to be my crowning achievement: the 2000 Bike Trip Laptop Replacement (based on a '43).

From 2000 to 2004, I did basically nothing with PICs. I let the pieces sit in the attic with the rest of my electronics.

In 2002, I became interested in Amateur Radio. In early 2004, I found a new way to use PICs: to create an encoder for APRS, the Automatic Packet Reporting System used by Amateur Radio operators.

While working out the details of an APRS encoder, I wanted to look at the assembly being generated by a C compiler. I found about a dozen different disassemblers... all for Windows.

So here's my contribution to the craft: a 16C84 disassembler written in Perl. Theoretically this should work for most PICs, and isn't necessarily 16C84-specific. Enjoy. (Update 1/2007: now contains program flow graphing, gpasm .map file parsing, and is tested against 16F62[78]a products.)

#!/usr/bin/perl

##########################################################################
#
# PIC 16C84, 16F84, 16F627, 16F628 (and related) disassembler
#
#     (c) 2004-2007 Jorj Bauer, 
#
# Placed in the public domain. Free for any (including commercial) use.
#
# Please be courteous and give me credit where credit may be due.
#
##########################################################################

use strict;
use Getopt::Std;
use GraphViz;

our ($opt_a, $opt_b, $opt_i, $opt_r, $opt_g, $opt_m, $opt_l);

# Attributes applied to every GraphViz node and edge this program creates.
my %graphopts = (
    fontname      => 'luxisri',
    labelfontname => 'luxisri',
    fontsize      => 12,
    labelfontsize => 10,
);

# Release number of this disassembler.
my $version = '1.04';

# List of microcontroller instructions. The arrays are information about 
# how to disassemble the instruction:
#   [0] is the command (sans arguments);
#   [1] is the mask (to derive the command or the arguments),
#   [2] is the type of operand. (Note that this could probably be derived
#       from the mask, but I've decided to include it here in order to 
#       provide some more flexibility if a future modification to this 
#       program makes it harder to derive these from the mask.)
#       The values are:
#           0 means it's a standalone opcode,
#           F means it has a file register,
#           FB means it has a file, bitnumber,
#           FD means it has a file, destination,
#           8K means it has an 8-bit k (literal) value,
#           11K means it has an 11-bit k (literal) value

# Opcode table, keyed by mnemonic. Each value is [ bits, mask, operand type ]:
# a word decodes to the mnemonic when (word & mask) == bits.
my %instructions = (
    # Standalone opcodes (operand type 0).
    CLRW   => [ 0x0100, 0xff80, 0 ],
    CLRWDT => [ 0x0064, 0xffff, 0 ],
    NOP    => [ 0x0000, 0xff9f, 0 ],
    OPTION => [ 0x0062, 0xffff, 0 ],
    RETFIE => [ 0x0009, 0xffff, 0 ],
    RETURN => [ 0x0008, 0xffff, 0 ],
    SLEEP  => [ 0x0063, 0xffff, 0 ],

    # File register only.
    CLRF   => [ 0x0180, 0xff80, 'F' ],
    MOVWF  => [ 0x0080, 0xff80, 'F' ],

    # File register plus bit number.
    BCF    => [ 0x1000, 0xfc00, 'FB' ],
    BTFSC  => [ 0x1800, 0xfc00, 'FB' ],
    BTFSS  => [ 0x1c00, 0xfc00, 'FB' ],
    BSF    => [ 0x1400, 0xfc00, 'FB' ],

    # File register plus destination (W or F).
    ADDWF  => [ 0x0700, 0xff00, 'FD' ],
    ANDWF  => [ 0x0500, 0xff00, 'FD' ],
    COMF   => [ 0x0900, 0xff00, 'FD' ],
    DECF   => [ 0x0300, 0xff00, 'FD' ],
    DECFSZ => [ 0x0b00, 0xff00, 'FD' ],
    INCF   => [ 0x0a00, 0xff00, 'FD' ],
    INCFSZ => [ 0x0f00, 0xff00, 'FD' ],
    IORWF  => [ 0x0400, 0xff00, 'FD' ],
    MOVF   => [ 0x0800, 0xff00, 'FD' ],
    RLF    => [ 0x0d00, 0xff00, 'FD' ],
    RRF    => [ 0x0c00, 0xff00, 'FD' ],
    SUBWF  => [ 0x0200, 0xff00, 'FD' ],
    SWAPF  => [ 0x0e00, 0xff00, 'FD' ],
    XORWF  => [ 0x0600, 0xff00, 'FD' ],

    # 8-bit literal.
    ADDLW  => [ 0x3e00, 0xfe00, '8K' ],
    ANDLW  => [ 0x3900, 0xff00, '8K' ],
    IORLW  => [ 0x3800, 0xff00, '8K' ],
    MOVLW  => [ 0x3000, 0xfc00, '8K' ],
    RETLW  => [ 0x3400, 0xfc00, '8K' ],
    SUBLW  => [ 0x3c00, 0xfe00, '8K' ],
    XORLW  => [ 0x3a00, 0xff00, '8K' ],

    # 11-bit literal (branch target).
    CALL   => [ 0x2000, 0xf800, '11K' ],
    GOTO   => [ 0x2800, 0xf800, '11K' ],
);

# File register names for the 16C84, keyed by register address. Addresses
# 0x00-0x0B are used as-is; disassemble_instruction adds 0x80 to the register
# number while its RP0 tracking flag is set, which selects the second group.
# load_register_file can add user-defined names to this table.
my %registers = (
    0x00 => 'INDF',
    0x01 => 'TMR0',
    0x02 => 'PCL',
    0x03 => 'STATUS',
    0x04 => 'FSR',
    0x05 => 'PORTA',
    0x06 => 'PORTB',
    0x08 => 'EEDATA',
    0x09 => 'EEADR',
    0x0A => 'PCLATH',
    0x0B => 'INTCON',

    0x80 => 'INDF',
    0x81 => 'OPTION',
    0x82 => 'PCL',
    0x83 => 'STATUS',
    0x84 => 'FSR',
    0x85 => 'TRISA',
    0x86 => 'TRISB',
    0x88 => 'EECON1',
    0x89 => 'EECON2',
    0x8A => 'PCLATH',
    0x8B => 'INTCON',
);

# Bit names for the common registers, keyed by register name. Each list is
# indexed by bit number (0 first); unnamed bits hold their own number.
my %bits = (
    STATUS => [ qw(C DC Z /PD /TO RP0 RP1 IRP) ],
    PORTA  => [ qw(RA0 RA1 RA2 RA3 RA4/T0CKI), 5, 6, 7 ],
    PORTB  => [ qw(RB0 RB1 RB2 RB3 RB4 RB5 RB6 RB7) ],
    INTCON => [ qw(RBIF INTF T0IF RBIE INTE T0IE EEIE GIE) ],
    OPTION => [ qw(PS0 PS1 PS2 PSA T0SE T0CS INTEDG /RBPU) ],
    EECON1 => [ qw(RD WR WREN WRERR EEIF), 5, 6, 7 ],
);
	     
# Labels for well-known addresses, keyed by word address. The first pass and
# load_register_file add further entries (branch targets, user names).
my %labels = (
    0x0000 => 'RESETVEC',
    0x0004 => 'INTRVEC',
    0x2007 => 'CONFIG',
    0x2100 => 'EEPROM',
);

# Addresses that are printed as 'dw ...' instead of being disassembled:
# 0x2007 holds the configuration bits and 0x2100+ holds the EEPROM data.
# @DWs and $MAX_PROGRAM_SPACE should probably be combined and made less
# processor-dependent. FIXME (eventually).
my @DWs = ( 0x2007, 0x2100 .. 0x2200 );
my $MAX_PROGRAM_SPACE = 0x2000;

# The whole program image, one word per element, indexed by word address.
our @memory;

# Word addresses that were actually present in the input file. Only these
# are disassembled; it is filled in while the file is read.
my %mask;

# Upper bound for the two passes over @memory: both loops run while
# $address < $codesize. The file readers return this value.
my $codesize = 0;

# Best-effort tracking of the STATUS RP0 bit (register bank select), so that
# file register operands can be named more accurately. It is updated as
# BSF/BCF instructions on RP0 are disassembled, so it will not be perfect.
my $RP0 = 0;

# Parse the command line. getopts returns false when it meets an option it
# does not know (or one that is missing its argument); show the usage text
# in that case instead of carrying on with a half-understood command line.
getopts('b:i:m:r:alg:') or usage();

# One input file is mandatory: a raw binary (-b) or an Intel hex file (-i).
usage() unless ($opt_b || $opt_i);

# The mnemonic table must be converted before anything is decoded.
make_opcodes_lowercase() if ($opt_l);

# Load the program image. -b takes precedence when both are given.
if ($opt_b) {
    $codesize = read_binary($opt_b);
} else {
    $codesize = read_intelhex($opt_i);
}

# Optional user-supplied register / label names.
load_register_file($opt_r) if ($opt_r);
load_map_file($opt_m)      if ($opt_m);

# The flow graph only exists when -g was requested.
my $graph;
if ($opt_g) {
    $graph = GraphViz->new();
}

# First pass: look at every word and, for each CALL or GOTO, make sure the
# address it refers to has a label. When graphing (-g), also record a node
# for the target and for the branching instruction, plus an edge between them.
#
# Only the 11-bit address inside the instruction is used as the target.
my $address = 0;
my $last_label;    # most recent branch target that had a non-anonymous label
do {
    my $word   = $memory[$address];
    my $opcode = uc(find_opcode($word));

    if ($opcode eq 'CALL' || $opcode eq 'GOTO') {
        # Find the referenced address and label it if it's not already
        # labelled.
        my $target = $word & 0x7FF;
        my $label  = make_label($target);

        # Anonymous labels are exactly what make_label generates: 'L'
        # followed by the address in (at least four) uppercase hex digits.
        # Matching that whole shape keeps user labels that merely start
        # with "L" plus a hex-looking letter (LED_ON, LCD_INIT, ...) from
        # being mistaken for anonymous ones.
        if ($label !~ /\AL[0-9A-F]{4,}\z/) {
            $last_label = $label;
        } elsif ($opt_g && defined $last_label) {
            # Anonymous labels have an implicit edge from the last
            # non-anonymous label. If none has been seen yet there is
            # nothing to draw the edge from, so it is skipped.
            $graph->add_edge($last_label => $label,
                             dir   => 'forward',
                             style => 'dotted',
                             color => 'black',
                             %graphopts,
                             );
        }

        if ($opt_g) {
            # Graphing, so register the target as a node...
            if (!$labels{$target}) {
                $labels{$target} = $label;
                $graph->add_node($label, %graphopts);
            }

            # ...and the branching instruction itself. Most of the
            # anonymous nodes are created here.
            if (!$labels{$address}) {
                $labels{$address} = make_label($address);
                $graph->add_node($labels{$address}, %graphopts);
            }

            # Add a line for the goto statement or the call to the
            # subroutine.
            if ($opcode eq 'GOTO') {
                $graph->add_edge($labels{$address} => $label,
                                 %graphopts,
                                 dir   => 'forward',
                                 style => 'solid',
                                 color => 'black',
                                 );
            } else {
                $graph->add_edge($labels{$address} => $label,
                                 dir   => 'both',
                                 style => 'dashed',
                                 color => 'red',
                                 %graphopts,
                                 );
            }
        } else {
            # Not graphing, so it's simple...
            $labels{$target} = $label unless ($labels{$target});
        }
    }

    $address++;
} while ($address < $codesize);

# Graph mode only: connect each labelled address to the label that precedes
# it, to show execution falling through from one chunk into the next. No
# edge is drawn when the instruction just before the label is an
# unconditional branch (i.e. when the program flow is interrupted there).
if ($opt_g) {
    # Highest address first. In this ordering the label that precedes an
    # address is simply the next element of the list.
    my @descending = sort { $b <=> $a } keys(%labels);

    for my $pos (0 .. $#descending) {
        my $here = $descending[$pos];

        next if ($here >= $MAX_PROGRAM_SPACE);
        next if ($here <= 5);
        next if (unconditional_branch_before($here));

        my $previous = ($pos < $#descending)
            ? $labels{ $descending[$pos + 1] }
            : 'unknown';

        $graph->add_edge($previous => $labels{$here},
                         dir => 'forward',
                         style => 'dotted',
                         color => 'magenta',
                         %graphopts);
    }
}

# Second pass: walk the image again and print the disassembly. Only addresses
# that were present in the input (%mask) are printed, and every contiguous
# run of them is introduced by an ORG line.
$address = 0;
my $insegment = 0; # false until an ORG has been printed for the current run
my %ends_flow = map { $_ => 1 } ('GOTO', 'RETURN', 'RETFIE', 'RETLW');
do {
    if (!$mask{$address}) {
        # A hole in the image: the next populated word starts a new run.
        $insegment = 0;
    } else {
        unless ($insegment) {
            printf("\n%s%-16.16s %-8.8s 0x%.4X\n\n",
                   format_address($address),
                   "",
                   ($opt_l ? "org" : "ORG"),
                   $address);
            $insegment = 1;
        }

        my $word = $memory[$address];
        my $text = disassemble_instruction($address, $word);

        # disassemble_instruction updates $RP0, so this is checked afterwards.
        $text = sprintf("%-40.40s %s", $text, "; NOTE: Working in Page 1")
            if ($RP0 == 1);

        print "$text\n";

        # Make the output pretty: leave a blank line after a RETFIE / RETURN /
        # GOTO / RETLW when the next address carries a label.
        my $mnemonic = uc(find_opcode($word));
        print "\n"
            if ($ends_flow{$mnemonic} && defined $labels{$address + 1});
    }

    $address++;
} while ($address < $codesize);

# If graphing, render the graph as a GIF into the file named by -g. Write
# errors on a buffered handle may only surface at print or close time, so
# both are checked.
if ($opt_g) {
    open(my $gif, ">", $opt_g)
        or die "Unable to open '$opt_g' for writing: $!";
    binmode $gif;
    print {$gif} $graph->as_gif()
        or die "Unable to write to '$opt_g': $!";
    close($gif)
        or die "Unable to finish writing '$opt_g': $!";
}

exit (0);

# Return the label stored under the highest address that is strictly below
# $address in the hash referenced by $labelsref, or the string 'unknown'
# when no such address exists.
sub find_previous_label {
    my ($labelsref, $address) = @_;

    my @below = grep { $_ < $address } keys %$labelsref;
    return 'unknown' unless (@below);

    my ($nearest) = sort { $b <=> $a } @below;
    return $labelsref->{$nearest};
}

# Return true (1) if the instruction just before $address is a branch that
# is always taken, i.e. execution cannot fall through into $address; return
# false (0) otherwise. (Used to compute flow diagrams.)
#
# A GOTO / RETURN / RETFIE / RETLW counts as unconditional unless the
# instruction before it is a conditional skip (BTFSS, BTFSC, DECFSZ,
# INCFSZ), in which case the branch may be skipped over.
sub unconditional_branch_before {
    my ($address) = @_;

    # Nothing precedes address 0. Without this check $memory[-1] would
    # read the last word of the image.
    return 0 if ($address < 1);

    # Is the previous instruction a return/goto/retfie/retlw?
    my $opcode = uc(find_opcode($memory[$address - 1]));
    unless ($opcode eq 'GOTO'   || $opcode eq 'RETURN' ||
            $opcode eq 'RETFIE' || $opcode eq 'RETLW') {
        # No branch before this statement.
        return 0;
    }

    # The branch is the very first instruction, so there is no opcode
    # before it that could skip it.
    return 1 if ($address == 1);

    # Check out the opcode before the branch and see if it's a skip.
    my $before = uc(find_opcode($memory[$address - 2]));
    if ($before eq 'BTFSS'  || $before eq 'BTFSC' ||
        $before eq 'DECFSZ' || $before eq 'INCFSZ') {
        # Conditional skip, so the branch is conditional.
        return 0;
    }

    # There's no way around it. The previous statement branched away.
    return 1;
}

# Tell whether the word stored at $address decodes to a GOTO (as opposed to
# a CALL or anything else). The comparison ignores the case of the mnemonic.
# (Used to compute flow diagrams.)
sub is_goto {
    my ($address) = @_;

    return (uc(find_opcode($memory[$address])) eq 'GOTO');
}

# Decode a program word into its mnemonic.
#
# Returns the key of the first %instructions entry whose pattern matches,
# i.e. (word & mask) == bits, or the string "???" when nothing matches.
sub find_opcode {
    my ($word) = @_;

    foreach my $opcode (keys(%instructions)) {
        my $info = $instructions{$opcode};

        # Ignore entries that have no bits/mask pair. Such an entry can be
        # created by accident (looking up an unknown mnemonic autovivifies
        # it), and with undefined values the test below would be 0 == 0,
        # matching every word.
        next unless (defined $info->[0] && defined $info->[1]);

        if (($word & $info->[1]) == $info->[0]) {
            return $opcode;
        }
    }

    return "???";
}

# Return the operand type (element [2] of the %instructions entry) for the
# given mnemonic, or undef when the mnemonic is not in the table (for
# example the "???" placeholder returned by find_opcode).
sub get_type {
    my ($opcode) = @_;

    # Check for the key first: dereferencing a missing entry would create
    # an empty one in %instructions as a side effect.
    my $info = (defined $opcode && exists $instructions{$opcode})
        ? $instructions{$opcode}
        : undef;

    return $info ? $info->[2] : undef;
}

# Format one line of disassembly for the word $word located at $address.
#
# The line starts with the optional address column and the label column.
# Addresses listed in @DWs are shown as a raw 'DW' value; anything else is
# decoded and followed by its operands, according to the operand type of
# the opcode. Decoding a BSF/BCF on the RP0 bit updates $RP0 as a side
# effect.
sub disassemble_instruction {
    my ($address, $word) = @_;

    my $prefix = format_address($address);
    my $label  = $labels{$address};

    if (is_dwaddr($address)) {
        return sprintf("%s%-16.16s %-8.8s 0x%.2X",
                       $prefix, $label, ($opt_l ? 'dw' : 'DW'), $word);
    }

    my $opcode = find_opcode($word);
    my $type   = get_type($opcode);
    my $base   = sprintf("%s%-16.16s %-8.8s", $prefix, $label, $opcode);

    # Standalone opcodes (and unknown ones) have no operands.
    return $base unless ($type);

    # Literal operands.
    return sprintf("%s 0x%02X", $base, $word & 0xFF)
        if ($type eq '8K');
    return sprintf("%s %s", $base, make_label($word & 0x7FF))
        if ($type eq '11K');

    if ($type eq 'F' || $type eq 'FD' || $type eq 'FB') {
        # The register number is the low 7 bits of the word; when the RP0
        # flag is set for page 1, 0x80 is added.
        my $register = $word & 0x7F;
        $register += 0x80 if ($RP0 == 1);
        my $regname = make_regname($register);

        return sprintf("%s %s", $base, $regname)
            if ($type eq 'F');

        if ($type eq 'FD') {
            # Bit 7 picks the destination.
            my $dest = (($word & 0x80) == 0) ? "W" : "F";
            return sprintf("%s %s, %s", $base, $regname, $dest);
        }

        # Type FB: bits 7-9 hold the bit number.
        my $bitnum  = ($word >> 7) & 7;
        my $bitname = make_bitname($regname, $bitnum);

        # A BSF or BCF whose bit is named RP0 changes the current RP0 flag.
        if ($opcode =~ /^b[sc]f/i && $bitname =~ /^rp0$/i) {
            $RP0 = ($opcode =~ /^bsf/i) ? 1 : 0;
        }

        return sprintf("%s %s, %s", $base, $regname, $bitname);
    }

    # Shouldn't ever reach here...
    return "???";
}

# Map a file register number to its display name: the (true) name found in
# %registers when there is one, otherwise the number as two-digit hex.
sub make_regname {
    my ($register) = @_;

    return $registers{$register} || sprintf("0x%02X", $register);
}

# Map a bit of the named register to its display name: the entry from the
# %bits table when one is defined, otherwise the bit number itself.
sub make_bitname {
    my ($regname, $bit) = @_;

    my $name = $bits{$regname}->[$bit];

    return (defined $name) ? $name : $bit;
}

# Produce the label for an address: the existing (true) entry in %labels
# when there is one, otherwise a generated name made of 'L' and the address
# in four-digit uppercase hex, which is unique per address.
sub make_label {
    my ($address) = @_;

    return $labels{$address} || sprintf("L%.4X", $address);
}

# Return 1 when $addr is one of the addresses listed in @DWs (printed as
# 'dw' data rather than disassembled), 0 otherwise.
sub is_dwaddr {
    my ($addr) = @_;

    my $hits = grep { $addr == $_ } @DWs;

    return $hits ? 1 : 0;
}

# Print the command-line help and terminate the program with exit status -1.
# The <filename> placeholders name the argument each option takes.
sub usage {
    print "Usage: $0 [-a] <-b <filename> | -i <filename>>\n".

	"\t-a:            show a column of addresses\n".
	"\t-b <filename>: read from binary file <filename>\n".
	"\t-i <filename>: read from Intel Hex file <filename>\n".
	"\t-g <filename>: graph, and save the gif as <filename>\n".
	"\t-l:            show opcodes in lowercase\n".
	"\t-m <filename>: load register hints from mapfile <filename>\n".
	"\t-r <filename>: load register name hint file <filename>\n".
	"\n"
	;
    
    exit(-1);
}

# read_binary: read in a little-endian binary file.
#
# Every complete 16-bit word is stored in @memory and marked in %mask; a
# trailing odd byte is ignored. Returns the number of words read. Dies if
# the file cannot be opened.
#
# Drawback: can't tell what areas of memory to NOT disassemble (empty
#           regions).

sub read_binary {
    my ($filename) = @_;

    # Declared as a list: "my $byte, $codesize;" would declare only $byte
    # and silently use the file-level $codesize.
    my ($pair, $codesize) = (undef, 0);

    # Three-argument open, so the file name is never interpreted as a mode
    # or a command.
    open(my $fh, '<', $filename) or die "Can't open '$filename': $!";
    binmode($fh);

    # Read the file two bytes at a time and populate the memory array.
    while (read($fh, $pair, 2) == 2) {
        $mask{$codesize} = 1;
        $memory[$codesize++] = unpack("v", $pair);
    }
    close($fh);

    return $codesize;
}

# read_intelhex: read in an Intel Hexfile format, as generated by
# picl and presumably MPASM. This lets us also tell which areas of
# memory are just holes, and shouldn't be disassembled.
#
# Only data records (type 0) are used; lines that are not well-formed
# records are skipped. Every word touched is stored in @memory and marked
# in %mask. Returns the highest word index seen plus one (0 when the file
# holds no data), the same "number of words" meaning read_binary uses, so
# loops running while $address < $codesize include the last word. Dies if
# the file cannot be opened.

sub read_intelhex {
    my ($filename) = @_;
    my $codesize = 0;

    open(my $fh, '<', $filename) or die "Can't open '$filename': $!";
    # Specifically NOT binmode.

    while (my $line = <$fh>) {
        # Strip the line ending, whether LF or CRLF.
        $line =~ s/[\r\n]+\z//;

        # :NNAAAATTDD...  byte count, byte address, record type, data
        # (the checksum that follows the data is left in $4, unused).
        next unless ($line =~
            /\A:([0-9A-Fa-f]{2})([0-9A-Fa-f]{4})([0-9A-Fa-f]{2})([0-9A-Fa-f]*)/);
        my ($num, $addr, $type, $data) = (hex($1), hex($2), hex($3), $4);

        next unless ($type == 0);

        # The Intel hex format is byte-based, but the PIC microcontroller
        # is word-based. We read in all of the data one byte at a time, and
        # decide whether it is the low or the high byte of the memory word.
        # (The Intel format is little-endian.)
        for my $i (0 .. $num - 1) {
            # Stop on a record shorter than its byte count claims.
            last if ($i * 2 + 2 > length($data));

            my $byte  = hex(substr($data, $i * 2, 2));
            my $index = int(($addr + $i) / 2);

            # An odd byte address is the high byte of the word.
            my $shift = (($addr + $i) & 0x01) ? 8 : 0;

            $memory[$index] = ($memory[$index] || 0) | ($byte << $shift);
            $mask{$index} = 1;

            $codesize = $index + 1 if ($index + 1 > $codesize);
        }
    }
    close($fh);

    return $codesize;
}

# Build the address column of an output line: the address as four-digit hex
# followed by a space when -a was given, an empty string otherwise.
sub format_address {
    my ($address) = @_;

    return $opt_a ? sprintf("0x%.4X ", $address) : "";
}

# Re-key the %instructions table with lowercase mnemonics (used for -l).
# Each entry is stored under the lowercased name and the original key is
# then removed.
sub make_opcodes_lowercase {
    my @mnemonics = keys(%instructions);

    for my $mnemonic (@mnemonics) {
        my $info = $instructions{$mnemonic};

        $instructions{ lc($mnemonic) } = $info;
        delete $instructions{$mnemonic};
    }
}

# Read a hint file of whitespace-delimited "name address" pairs, one per
# line, and add the names to the lookup tables:
#   - an address written with a leading 'L' (e.g. L0x0123) names a program
#     address; the rest is parsed as hex and stored in %labels;
#   - any other address names a file register and is stored in %registers
#     (parsed as hex when it starts with 0x, used as-is otherwise).
# Lines without both fields are skipped. Dies if the file cannot be opened.
sub load_register_file {
    my ($file) = @_;

    open(my $fh, '<', $file) or die "Can't open register file '$file': $!";
    while (my $line = <$fh>) {
        # Splitting on ' ' ignores leading whitespace and the line ending.
        my ($var, $addr) = split(' ', $line);
        next unless (defined $var && defined $addr);

        if ($addr =~ /^L(.+)$/) {
            # It's a label, not a var.
            $labels{ hex($1) } = $var;
        } else {
            # It's a var.
            $addr = hex($addr) if ($addr =~ /^0x/i);
            $registers{$addr} = $var;
        }
    }
    close($fh);
}

# Read a .map file and pick up every symbol that has a hex address.
#
# Each line is cut into fixed-width columns: 25 characters of name, 11 of
# address, 11 of location, then the rest. Symbols whose location is
# 'program' become labels (%labels); those whose location is 'data' become
# register names (%registers). Lines that do not fit the layout, or whose
# address is not a 0x... number, are ignored. Dies if the file cannot be
# opened.
#
# The tables are filled in directly, so no temporary file is written.
sub load_map_file {
    my ($file) = @_;

    open(my $fh, '<', $file) or die "Can't open map file '$file': $!";
    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;

        my ($var, $addr, $loc) = $line =~ /^(.{25})(.{11})(.{11})(.+)$/
            or next;

        # The columns are space-padded.
        s/ //g for ($var, $addr, $loc);

        next unless ($addr =~ /^0x[0-9A-Fa-f]+$/);

        if ($loc eq 'program') {
            $labels{ hex($addr) } = $var;
        } elsif ($loc eq 'data') {
            $registers{ hex($addr) } = $var;
        }
    }
    close($fh);
}