
/* Hive Spin Compiler lexical analysis */

#include "lexer.h"
#include "keyword.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* the input stream being tokenized; assigned by OpenFile() and closed by
 * CloseFile()
 */
static FILE *f;
/* token code of the section keyword most recently seen at column 0
 * (NextToken() stores values in the TK_CON..TK_VAR range here); consulted
 * to decide whether ":name" is a DAT section local label
 */
static unsigned curSection;

/* the current and next (lookahead) character, or EOF if unavailable;
 * both are advanced by next_char()
 */
static int c, nc = 0;
/* the line and column number; next_char() bumps lineno and clears col
 * when it moves past a '\n' and increments col otherwise
 */
static int lineno = 1, col = 0;

/* the lexer API: the current token as a global symbol and
 * three functions. NextToken() overwrites `token` on every call.
 */
Token token;

void NextToken();
void OpenFile(char *);
void CloseFile();

/* internal forward declarations (both are defined later in this file) */
static void next_char();
static int lookup_keyword(char *str);

void OpenFile(char *fn)
{
  f = fopen(fn, "r");
  if( !f ) hsc_abort("can't open file");
  curSection = TK_UNKNOWN;
  /* fill the pipeline */
  next_char();
  next_char();
  NextToken();
}

/* Close the input file opened by OpenFile().
 *
 * Does nothing when no file is open, and clears the stream pointer after
 * closing, so a repeated call cannot hand fclose() a null or stale stream
 * (both are undefined behaviour).
 */
void CloseFile()
{
  if( f ) {
    fclose(f);
    f = NULL;
  }
}

/* Advance the two character pipeline by one position.
 *
 * The old lookahead becomes the current character and a fresh lookahead is
 * read from the input file (nothing more is read once the lookahead is EOF).
 * Line and column counters are updated from the character being left
 * behind. Lower case letters are folded to upper case as they enter the
 * lookahead slot. When the new current character is a CR directly followed
 * by LF the pipeline is advanced once more, so callers only ever see the LF.
 * A read error aborts through hsc_abort().
 */
static void next_char()
{
  int skip_cr;

  do {
    /* account for the character we are moving away from */
    if( c!='\n' ) {
      ++col;
    } else {
      lineno++;
      col = 0;
    }

    /* shift the pipeline */
    c = nc;
    if( nc!=EOF) nc = fgetc(f);
    if( nc==EOF && ferror(f) ) hsc_abort("error reading input file");

    /* fold the lookahead to upper case */
    if( nc>='a' && nc<='z' ) nc -= 'a' - 'A';

    /* a CR LF pair collapses into the LF */
    skip_cr = ( c=='\r' && nc=='\n' );
  } while( skip_cr );
}

/* Turn a binary operator into its assignment form when the current
 * character is '='.
 *
 * kw is the token code just recognised and i the length of its text in
 * token.text. If IsBinaryOp(kw) holds and the current character is '=',
 * the '=' is appended to token.text, consumed from the input, and kw is
 * returned with TK_ASSIGNMENT or-ed in. Otherwise kw is returned unchanged
 * and nothing is consumed.
 */
static int fixupAssignOp(int kw, int i)
{
  if( !IsBinaryOp(kw) || c!='=' ) return kw;

  /* extend the token text with the '=' and keep it terminated */
  token.text[i] = c;
  token.text[i+1] = 0;
  next_char();

  return (kw | TK_ASSIGNMENT);
}

/* Tell whether c may appear in a number written in the given radix.
 *
 * An underscore is a digit separator: for '_' the value of sep is returned
 * as is. For a radix of at most 10 the digits '0' .. '0'+radix-1 are
 * accepted; any larger radix is treated as hexadecimal and accepts 0-9,
 * A-F and a-f. Returns non-zero for an acceptable character, 0 otherwise.
 */
static int isDigit(int c, int radix, int sep)
{
  if( c=='_' ) return sep;

  if( radix>10 ) {
    int decimal = (c>='0' && c<='9');
    int upper   = (c>='A' && c<='F');
    int lower   = (c>='a' && c<='f');
    return ( decimal || upper || lower );
  }

  return ( c>='0' && c<'0'+radix );
}

/* Tell whether c can start or continue an identifier: an ASCII letter of
 * either case or an underscore. Returns 1 if so, 0 otherwise.
 */
static int isAlpha(int c)
{
  if( c=='_' ) return 1;
  if( c>='A' && c<='Z' ) return 1;
  return ( c>='a' && c<='z' );
}

/* Return the numeric value of digit c.
 *
 * '0'..'9' map to 0..9 and the hexadecimal letters 'A'..'F' / 'a'..'f' map
 * to 10..15. Any other character yields -1.
 */
static int digitValue(int c)
{
  if( c>='0' && c<='9' ) return (c - '0');
  /* hex letters start at ten, not zero */
  if( c>='A' && c<='F' ) return (c - 'A' + 10);
  if( c>='a' && c<='f' ) return (c - 'a' + 10);
  return (-1);
}

/* Skip blanks and comments in front of the next token.
 *
 * Spaces, "{ ... }" comments and "{{ ... }}" comments are consumed. A
 * "'" comment is consumed up to, but not including, the end of its line.
 * TAB characters are rejected and a brace comment that is still open at
 * end of input is reported through hsc_abort().
 *
 * Returns 0 when the end of input was reached and 1 otherwise; in the
 * latter case the current character is the first one that was not skipped
 * (this can be the '\n' that ends a line).
 */
static int skipWhite()
{
  while(1) {
    switch(c) {
      case ' ': next_char(); break;
      case '\t': hsc_abort("illegal TAB character in input");
      /* hsc_abort() is not expected to return; if it does, control falls
       * through and end of input is reported */

      case EOF: return 0;

      /* handle { and {{ comments */
      case '{' :
        if( nc=='{' ) {
          /* {{ comment: runs until the first }} pair */
          do {
            if( c==EOF ) break;
            next_char();
          } while( c!='}' || nc!='}' );
          if( c==EOF ) hsc_abort("unterminated comment");
          next_char();
          next_char();
        }
        else {
          /* { comment: runs until the first } */
          do {
            if( c==EOF ) break;
            next_char();
          } while( c!='}' );
          if( c==EOF ) hsc_abort("unterminated comment");
          next_char();
        }
        break;

      /* handle ' and '' comments */
      case '\'':
        /* c holds a character or EOF, never a token code, so the loop must
         * test against EOF; otherwise a comment on an unterminated last
         * line would never end */
        do {
          next_char();
        } while( c!='\n' && c!=EOF );
        return (c!=EOF);

      default: return 1;
    }
  }
}

/* States of the string sub-tokenizer inside NextToken():
 *   NORMAL    - not inside a string literal
 *   STR_INT   - the next token is one string character as an integer
 *   STR_COMMA - the next token is a comma, or the string's closing quote
 */
#define NORMAL       0
#define STR_INT      1
#define STR_COMMA    2

/* Scan the next token from the input into the global `token`.
 *
 * token.type receives the token code, token.line / token.col the position
 * where the token starts, token.value the numeric value of literals and
 * token.text the spelling of identifiers, keywords, operators and decimal
 * numbers. The following kinds of token are produced:
 *   - TK_EOF at end of input and TK_EOL for every newline;
 *   - TK_INTLITERAL for $hex, %binary, %%quaternary and decimal numbers
 *     ('_' may be used as a digit separator);
 *   - TK_FLOATLITERAL for decimal numbers containing a '.', with the bit
 *     pattern of the single precision value stored in token.value;
 *   - an alternating TK_INTLITERAL / TK_COMMA sequence for string literals;
 *   - the code from the keyword table for keywords and operators, turned
 *     into an assignment operator by fixupAssignOp() where applicable;
 *   - TK_ID for other identifiers and for ":name" local labels while the
 *     current section is DAT.
 * Malformed input is reported through hsc_abort().
 *
 * NOTE(review): the length checks for numbers (i>=MAXTOKENLENGTH) and for
 * identifiers (i>MAXTOKENLENGTH) differ by one, and fixupAssignOp() appends
 * without a check; confirm against the declared size of token.text.
 */
void NextToken()
{
  /* string sub-state; survives between calls because one string literal is
   * delivered as several tokens */
  static int state;
  int kw;

  token.line = lineno;
  token.col = col;
  token.value = 0;
  token.text[0] = 0;

  /* Strings are tokenized as a sequence of integers separated by commas. For
   * example, "abc" is tokenized as three integer tokens with a comma token
   * between each pair. Note there is no terminating zero. Empty strings are
   * invalid.
   * NOTE(review): next_char() folds letters to upper case before they get
   * here, so lower case letters inside a string appear to be delivered as
   * upper case - confirm whether that is intended.
   */
  switch(state) {
    case STR_INT:
      token.type = TK_INTLITERAL;
      /* a string must close on its own line; testing for EOF as well stops
       * an endless comma/-1 token stream when the input ends inside it */
      if( c=='\n' || c==EOF ) hsc_abort("unterminated string");
      token.value = c;
      next_char();
      state = STR_COMMA;
      return;
    case STR_COMMA:
      if( c=='"' ) {
        /* closing quote: leave string mode and scan a regular token */
        next_char();
        state = NORMAL;
        break;
      }
      token.type = TK_COMMA;
      state = STR_INT;
      return;
  }

  /* move to next non-white character, return EOF if none */
  if( !skipWhite() ) {
    token.type = TK_EOF;
    token.line = lineno;
    token.col = col;
    strcpy(token.text, "(EOF)");
    return;
  }

  /* the token starts here, after any skipped blanks and comments */
  token.line = lineno;
  token.col = col;

  /* if at end-of-line return EOL */
  if( c=='\n' ) {
    token.type = TK_EOL;
    strcpy(token.text, "(EOL)");
    next_char();
    return;
  }

  /* tokenize a string into a int & comma sequence */
  if( c=='"' ) {
    if( nc=='"' ) hsc_abort("empty string");
    state = STR_INT;
    next_char();
    /* the recursive call delivers the first character of the string */
    NextToken();
    return;
  }

  /* handle a hex, binary or quad base number */
  if( c=='%' || c=='$' ) {
    int radix = 16;
    token.type = TK_INTLITERAL;
    if( c=='%' ) {
      radix = 2;
      if( nc=='%' ) {
        next_char();
        radix = 4;
      }
    }
    next_char();
    /* the first character must be a real digit, not a separator */
    if( !isDigit(c, radix, 0) ) hsc_abort("bad character in number");
    token.value = 0;
    while( isDigit(c, radix, 1) ) {
      if( c!='_' ) token.value = token.value * radix + digitValue(c);
      next_char();
    }
    return;
  }

  /* handle a decimal number, integer or float. fp_state is 1 for integers, turns 2
   * when a . is seen, 3 when a E is seen and 4 when a + or - is seen. A zero state
   * indicates completion.
   */
  if( isDigit(c, 10, 0) ) {
    int i = 0, fp = 0, fp_state = 1;
    do {
      switch( c ) {
        case '.':
          /* ".." is the range operator, not a decimal point */
          fp_state = (nc!='.' && fp_state==1) ? 2 : 0;
          break;
        case 'e': case 'E':
          fp_state = (fp_state==2) ? 3 : 0;
          break;
        case '+': case '-':
          fp_state = (fp_state==3) ? 4 : 0;
          break;
        default:
          if( !isDigit(c, 10, 1) ) {
            fp_state = 0;
          } else {
            fp_state = (fp_state==3) ? 4 : fp_state;
          }
      }
      if( fp_state>1 ) fp = 1;
      if( i>=MAXTOKENLENGTH ) hsc_abort("number constant too long");
      if( c!='_' ) token.text[i++] = c;
      if( fp_state ) next_char();
    } while( fp_state>0 );
    /* the character that ended the number was stored as well; drop it */
    token.text[--i] = 0;
    if( !fp ) {
      token.type = TK_INTLITERAL;
      token.value = strtoul(token.text, NULL, 10);
    }
    else {
      float flt;
      int32_t bits = 0;
      token.type = TK_FLOATLITERAL;
      flt = (float)strtod(token.text, NULL);
      /* copy the bit pattern instead of reading the float through a long
       * pointer: that cast violates the aliasing rules and reads past the
       * object where long is wider than float (assumes a 32-bit float) */
      memcpy(&bits, &flt, sizeof bits);
      token.value = bits;
    }
    return;
  }

  /* handle an identifier or alphanumeric keyword */
  if( isAlpha(c) ) {
    int i = 0;
    while( isAlpha(c) || isDigit(c, 10, 1) ) {
      if( i>MAXTOKENLENGTH ) hsc_abort("identifier too long");
      token.text[i++] = c;
      next_char();
    }
    token.text[i] = 0;
    if( (kw = lookup_keyword(token.text)) ) {
      /* a section keyword in column 0 opens a new section */
      if( kw>=TK_CON && kw<=TK_VAR && token.col==0 ) curSection = kw;
      token.type = fixupAssignOp(kw, i);
      return;
    }
    token.type = TK_ID;
    return;
  }

  /* handle DAT section local labels */
  if( c==':' && curSection==TK_DAT && isAlpha(nc) ) {
    int i = 1;
    token.text[0] = ':';
    next_char();
    while( isAlpha(c) || isDigit(c, 10, 1) ) {
      if( i>MAXTOKENLENGTH ) hsc_abort("identifier too long");
      token.text[i++] = c;
      next_char();
    }
    token.text[i] = 0;
    token.type = TK_ID;
    return;
  }

  /* handle all other characters; we simply search for the longest
   * 'keyword' match. Note that all operator sequences are in the
   * keyword table. FIXME: use a better algorithm. */
  {
    int i = 0;
    token.text[i++] = c;
    token.text[i] = 0;
    /* keep appending the lookahead for as long as the text is an operator */
    while( (kw = lookup_keyword(token.text)) ) {
      token.text[i++] = nc;
      token.text[i] = 0;
      next_char();
    }
    if( i==1 ) {
      /* not even the first character is an operator */
      token.type = TK_UNKNOWN;
      hsc_abort("unrecogized character/token");
    }
    /* the last character appended broke the match; remove it again */
    token.text[--i] = 0;
    token.type = lookup_keyword(token.text);
    token.type = fixupAssignOp(token.type, i);
    return;
  }
  /* never reached */
}

/* Keyword and operator table: maps the spelling of every keyword and
 * operator sequence to its token code. Alphabetic keywords are spelled in
 * upper case.
 *
 * This list must be sorted in ascending strcmp order, because kw_bsearch()
 * performs a binary search on the label field.
 *
 * Several spellings share one token code: a number of IF_xxx condition
 * names map to the code of another condition (for example IF_C maps to
 * TK_IF_B and IF_Z to TK_IF_E), and CHIPVER, IF_NEVER and NOP map to
 * TK_INTRINSIC, TK_COND and TK_PASM respectively. The final "\177" entry
 * carries TK_UNKNOWN and sorts after every other label in the table.
 */
static struct tokens {
  char     *label;   /* spelling to match */
  unsigned  value;   /* token code returned for that spelling */
} tk_table[] = {
    { "!",                TK_BANG },
    { "#",                TK_HASH },
    { "#>",               TK_HASHGT },
    { "$",                TK_DOLLAR },
    { "&",                TK_AMPERSAND },
    { "(",                TK_LPAREN },
    { ")",                TK_RPAREN },
    { "*",                TK_STAR },
    { "**",               TK_STARSTAR },
    { "+",                TK_PLUS },
    { "++",               TK_PLUSPLUS },
    { ",",                TK_COMMA },
    { "-",                TK_MINUS },
    { "--",               TK_MINUSMINUS },
    { "->",               TK_MINUSGT },
    { ".",                TK_DOT },
    { "..",               TK_DOTDOT },
    { "/",                TK_SLASH },
    { "//",               TK_SLASHSLASH },
    { ":",                TK_COLON },
    { ":=",               TK_COLONEQUAL },
    { "<",                TK_LT },
    { "<#",               TK_LTHASH },
    { "<-",               TK_LTMINUS },
    { "<<",               TK_LTLT },
    { "<>",               TK_LTGT },
    { "=",                TK_EQUAL },
    { "=<",               TK_EQUALLT },
    { "==",               TK_EQUALEQUAL },
    { "=>",               TK_EQUALGT },
    { ">",                TK_GT },
    { "><",               TK_GTLT },
    { ">>",               TK_GTGT },
    { ">|",               TK_GTBAR },
    { "?",                TK_QUESTION },
    { "@",                TK_AT },
    { "@@",               TK_ATAT },
    { "ABORT",            TK_ABORT },
    { "ABS",              TK_ABS },
    { "ABSNEG",           TK_ABSNEG },
    { "ADD",              TK_ADD },
    { "ADDABS",           TK_ADDABS },
    { "ADDS",             TK_ADDS },
    { "ADDSX",            TK_ADDSX },
    { "ADDX",             TK_ADDX },
    { "AND",              TK_AND },
    { "ANDN",             TK_ANDN },
    { "BYTE",             TK_BYTE },
    { "BYTEFILL",         TK_BYTEFILL },
    { "BYTEMOVE",         TK_BYTEMOVE },
    { "CALL",             TK_CALL },
    { "CASE",             TK_CASE },
    { "CHIPVER",          TK_INTRINSIC },
    { "CLKFREQ",          TK_CLKFREQ },
    { "CLKMODE",          TK_CLKMODE },
    { "CLKSET",           TK_CLKSET },
    { "CMP",              TK_CMP },
    { "CMPS",             TK_CMPS },
    { "CMPSUB",           TK_CMPSUB },
    { "CMPSX",            TK_CMPSX },
    { "CMPX",             TK_CMPX },
    { "CNT",              TK_CNT },
    { "COGID",            TK_COGID },
    { "COGINIT",          TK_COGINIT },
    { "COGNEW",           TK_COGNEW },
    { "COGSTOP",          TK_COGSTOP },
    { "CON",              TK_CON },
    { "CONSTANT",         TK_CONSTANT },
    { "CTRA",             TK_CTRA },
    { "CTRB",             TK_CTRB },
    { "DAT",              TK_DAT },
    { "DIRA",             TK_DIRA },
    { "DIRB",             TK_DIRB },
    { "DJNZ",             TK_DJNZ },
    { "ELSE",             TK_ELSE },
    { "ELSEIF",           TK_ELSEIF },
    { "ELSEIFNOT",        TK_ELSEIFNOT },
    { "FILE",             TK_FILE },
    { "FIT",              TK_FIT },
    { "FLOAT",            TK_FLOAT },
    { "FROM",             TK_FROM },
    { "FRQA",             TK_FRQA },
    { "FRQB",             TK_FRQB },
    { "HUBOP",            TK_HUBOP },
    { "IF",               TK_IF },
    { "IFNOT",            TK_IFNOT },
    { "IF_A",             TK_IF_A },
    { "IF_AE",            TK_IF_AE },
    { "IF_ALWAYS",        TK_IF_ALWAYS },
    { "IF_B",             TK_IF_B },
    { "IF_BE",            TK_IF_BE },
    { "IF_C",             TK_IF_B },
    { "IF_C_AND_NZ",      TK_IF_C_AND_NZ },
    { "IF_C_AND_Z",       TK_IF_C_AND_Z },
    { "IF_C_EQ_Z",        TK_IF_C_EQ_Z },
    { "IF_C_NE_Z",        TK_IF_C_NE_Z },
    { "IF_C_OR_NZ",       TK_IF_C_OR_NZ },
    { "IF_C_OR_Z",        TK_IF_BE },
    { "IF_E",             TK_IF_E },
    { "IF_NC",            TK_IF_AE },
    { "IF_NC_AND_NZ",     TK_IF_A },
    { "IF_NC_AND_Z",      TK_IF_NC_AND_Z },
    { "IF_NC_OR_NZ",      TK_IF_NC_OR_NZ },
    { "IF_NC_OR_Z",       TK_IF_NC_OR_Z },
    { "IF_NE",            TK_IF_NE },
    { "IF_NEVER",         TK_COND },
    { "IF_NZ",            TK_IF_NE },
    { "IF_NZ_AND_C",      TK_IF_C_AND_NZ },
    { "IF_NZ_AND_NC",     TK_IF_A },
    { "IF_NZ_OR_C",       TK_IF_BE },
    { "IF_NZ_OR_NC",      TK_IF_NC_OR_Z },
    { "IF_Z",             TK_IF_E },
    { "IF_Z_AND_C",       TK_IF_C_AND_Z },
    { "IF_Z_AND_NC",      TK_IF_NC_AND_Z },
    { "IF_Z_EQ_C",        TK_IF_C_EQ_Z },
    { "IF_Z_NE_C",        TK_IF_C_NE_Z },
    { "IF_Z_OR_C",        TK_IF_BE },
    { "IF_Z_OR_NC",       TK_IF_NC_OR_Z },
    { "INA",              TK_INA },
    { "INB",              TK_INB },
    { "JMP",              TK_JMP },
    { "JMPRET",           TK_JMPRET },
    { "LOCKCLR",          TK_LOCKCLR },
    { "LOCKNEW",          TK_LOCKNEW },
    { "LOCKRET",          TK_LOCKRET },
    { "LOCKSET",          TK_LOCKSET },
    { "LONG",             TK_LONG },
    { "LONGFILL",         TK_LONGFILL },
    { "LONGMOVE",         TK_LONGMOVE },
    { "LOOKDOWN",         TK_LOOKDOWN },
    { "LOOKDOWNZ",        TK_LOOKDOWNZ },
    { "LOOKUP",           TK_LOOKUP },
    { "LOOKUPZ",          TK_LOOKUPZ },
    { "MAX",              TK_MAX },
    { "MAXS",             TK_MAXS },
    { "MIN",              TK_MIN },
    { "MINS",             TK_MINS },
    { "MOV",              TK_MOV },
    { "MOVD",             TK_MOVD },
    { "MOVI",             TK_MOVI },
    { "MOVS",             TK_MOVS },
    { "MUXC",             TK_MUXC },
    { "MUXNC",            TK_MUXNC },
    { "MUXNZ",            TK_MUXNZ },
    { "MUXZ",             TK_MUXZ },
    { "NEG",              TK_NEG },
    { "NEGC",             TK_NEGC },
    { "NEGNC",            TK_NEGNC },
    { "NEGNZ",            TK_NEGNZ },
    { "NEGZ",             TK_NEGZ },
    { "NEXT",             TK_NEXT },
    { "NOP",              TK_PASM },
    { "NOT",              TK_NOT },
    { "NR",               TK_NR },
    { "OBJ",              TK_OBJ },
    { "OR",               TK_OR },
    { "ORG",              TK_ORG },
    { "ORGX",             TK_ORGX },
    { "OTHER",            TK_OTHER },
    { "OUTA",             TK_OUTA },
    { "OUTB",             TK_OUTB },
    { "PAR",              TK_PAR },
    { "PHSA",             TK_PHSA },
    { "PHSB",             TK_PHSB },
    { "PRI",              TK_PRI },
    { "PUB",              TK_PUB },
    { "QUIT",             TK_QUIT },
    { "RCL",              TK_RCL },
    { "RCR",              TK_RCR },
    { "RDBYTE",           TK_RDBYTE },
    { "RDLONG",           TK_RDLONG },
    { "RDWORD",           TK_RDWORD },
    { "REBOOT",           TK_REBOOT },
    { "REPEAT",           TK_REPEAT },
    { "RES",              TK_RES },
    { "RESULT",           TK_RESULT },
    { "RET",              TK_RET },
    { "RETURN",           TK_RETURN },
    { "REV",              TK_REV },
    { "ROL",              TK_ROL },
    { "ROR",              TK_ROR },
    { "ROUND",            TK_ROUND },
    { "SAR",              TK_SAR },
    { "SHL",              TK_SHL },
    { "SHR",              TK_SHR },
    { "SPR",              TK_SPR },
    { "STEP",             TK_STEP },
    { "STRCOMP",          TK_STRCOMP },
    { "STRING",           TK_STRING },
    { "STRSIZE",          TK_STRSIZE },
    { "SUB",              TK_SUB },
    { "SUBABS",           TK_SUBABS },
    { "SUBS",             TK_SUBS },
    { "SUBSX",            TK_SUBSX },
    { "SUBX",             TK_SUBX },
    { "SUMC",             TK_SUMC },
    { "SUMNC",            TK_SUMNC },
    { "SUMNZ",            TK_SUMNZ },
    { "SUMZ",             TK_SUMZ },
    { "TEST",             TK_TEST },
    { "TJNZ",             TK_TJNZ },
    { "TJZ",              TK_TJZ },
    { "TO",               TK_TO },
    { "TRUNC",            TK_TRUNC },
    { "UNTIL",            TK_UNTIL },
    { "VAR",              TK_VAR },
    { "VCFG",             TK_VCFG },
    { "VSCL",             TK_VSCL },
    { "WAITCNT",          TK_WAITCNT },
    { "WAITPEQ",          TK_WAITPEQ },
    { "WAITPNE",          TK_WAITPNE },
    { "WAITVID",          TK_WAITVID },
    { "WC",               TK_WC },
    { "WHILE",            TK_WHILE },
    { "WORD",             TK_WORD },
    { "WORDFILL",         TK_WORDFILL },
    { "WORDMOVE",         TK_WORDMOVE },
    { "WR",               TK_WR },
    { "WRBYTE",           TK_WRBYTE },
    { "WRLONG",           TK_WRLONG },
    { "WRWORD",           TK_WRWORD },
    { "WZ",               TK_WZ },
    { "XOR",              TK_XOR },
    { "[",                TK_LBRACKET },
    { "\\",                TK_BACKSLASH },
    { "]",                TK_RBRACKET },
    { "^",                TK_CARET },
    { "^^",               TK_CARETCARET },
    { "{",                TK_LBRACE },
    { "|",                TK_BAR },
    { "|<",               TK_BARLT },
    { "||",               TK_BARBAR },
    { "}",                TK_RBRACE },
    { "~",                TK_TILDE },
    { "~>",               TK_TILDEGT },
    { "~~",               TK_TILDETILDE },
    { "\177",             TK_UNKNOWN }
};

#define COUNT (sizeof(tk_table)/sizeof(tk_table[0]))

/* Binary search for str in tk_table, restricted to the entries with index
 * low..high (both bounds inclusive). Returns the value field of the
 * matching entry, or 0 when no label in that range equals str.
 */
static int kw_bsearch(char *str, int low, int high)
{
  while( low<=high ) {
    int mid = (low + high) / 2;
    int cmp = strcmp( tk_table[mid].label, str);

    if( cmp==0 ) return tk_table[mid].value;

    if( cmp>0 ) {
      /* the probed label sorts after str: continue in the lower half */
      high = mid - 1;
    } else {
      /* the probed label sorts before str: continue in the upper half */
      low = mid + 1;
    }
  }
  return 0;
}

/* Look up str in the keyword/operator table.
 *
 * Returns the token code stored for str, or 0 when str is not in the table.
 *
 * kw_bsearch() takes inclusive bounds, so the upper bound is the index of
 * the last entry. Passing COUNT itself lets the search probe one element
 * past the end of tk_table for any text that sorts after the final entry
 * (for example text starting with a byte above 0x7f).
 */
static int lookup_keyword(char *str)
{
  return kw_bsearch(str, 0, (int)COUNT - 1);
}
