lr/lex.c

252 lines
8.7 KiB
C

#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<ctype.h>
#include"lex.h"
#include"str.h"
#include"reg.h"
#include"err.h"
#include"state.h"
/* Printable names of the token types, terminated by NULL.
 * lex_print indexes this table directly with Tok.type. */
const char*lextype_names[]={
	"LNONE",
	"LIDENTIFIER",
	"LINTEGER",
	"LFLOAT",
	"LSTRING",
	"LOPERATOR",
	"LKEYWORD",
	"LCOMMENT",
	"LMINUS",
	"LFAKE",
	NULL
};
/* ANSI SGR escape sequences, one per token type (same order as lextype_names).
 * NOTE(review): only 8 entries are present while lextype_names lists 10 types;
 * indexing with LMINUS or LFAKE would read past the end -- confirm no caller does. */
const char*lextype_colors[]={
	"\033[0m",  /* LNONE       */
	"\033[0m",  /* LIDENTIFIER */
	"\033[36m", /* LINTEGER    */
	"\033[35m", /* LFLOAT      */
	"\033[32m", /* LSTRING     */
	"\033[0m",  /* LOPERATOR   */
	"\033[33m", /* LKEYWORD    */
	"\033[34m"  /* LCOMMENT    */
};
/* Printable names of the operator subtypes, terminated by NULL.
 * lex_print indexes this table with (Tok.subtype - LENDSTATEMENT). */
const char*lexsubtype_names[]={
	"LENDSTATEMENT",
	"LASSIGN",
	"LLPAREN",
	"LRPAREN",
	"LLCBRACE",
	"LRCBRACE",
	"LSMINUS",
	"LADD",
	"LSMUL",
	"LSDIV",
	"LSNOT",
	"LSCOLON",
	"LSCOMMA",
	"LSREF",
	"LSDEREF",
	NULL
};
/* Every byte that may start or continue an operator lexeme */
static const char*operator_chars="-+*/=;(),.{}<>\\!:&";
/* Identifiers spelled exactly like one of these are retyped as LKEYWORD */
static const char*keywords[]={
	"do",
	"false",
	"fn",
	"for",
	"if",
	"let",
	"ret",
	"true",
	"while",
	"call",
	"asm",
	"ext",
	/* type specifiers */
	"i32",
};
/* static char*operators[]={";","=","+=","-=","*=","/=","+","-","/","*","(",")","{","}"}; */
Lexer lex_new(void)
{
Lexer l={
.mode=LNONE,
.tokens=vec_new(sizeof(Tok)),
.escape_sequence=false,
};
return l;
}
/*****
 * lex_free(l)
 * - Release the string owned by every token, then the token vector itself
 * - Does nothing when l is NULL
 *	l	Lexer*	lexer whose tokens are released
 *****/
void lex_free(Lexer*l)
{
	size_t idx=0;
	if(l==NULL)
		return;
	// Each Tok owns its str, so free those before the vector that stores them
	while(idx<l->tokens.size)
	{
		Tok*tok=vec_at(&l->tokens,idx,Tok*);
		str_free(&tok->str);
		++idx;
	}
	vec_free(&l->tokens);
}
/*****
 * lex_string(lex,input_string)
 * - Scan input_string one byte at a time and append the tokens found
 *   to lex->tokens; lex->mode selects how each byte is interpreted
 * - Returns immediately (touching nothing) when either argument is NULL
 *	lex		Lexer*	lexer whose mode, escape flag and token vector are updated
 *	input_string	char*	NUL-terminated text; not const because escape
 *				sequences inside string literals are rewritten in place
 *****/
void lex_string(Lexer*lex,char*input_string)
{
	//Reg regex=reg_new();
	size_t current_line=1;
	size_t input_string_len;
	// Validate the arguments BEFORE creating the scratch string, so the
	// early return cannot skip the str_free at the bottom
	if(!lex||!input_string)return;
	Str tmptokstr=str_new();
	input_string_len=strlen(input_string);
	// Read each individual byte
	// (the loop deliberately visits the terminating NUL too, see below)
	for(size_t i=0;i<input_string_len+1;++i)
	{
		/*****
		 * initmatch(chset,mode,keepch)
		 * - Match initial character and change lexer to corresponding mode,
		 * - clear tmptokstr and initialize new token
		 *	chset	char*		set of characters which input_string[i] must match
		 *	mode	uint32_t	change lexer mode to this
		 *	keepch	bool		will we retain this character in the token string?
		 *				(true steps i back so the byte is re-read in the new mode)
		 *****/
#define initmatch(chset,lmode,keepch) if(input_string[i]&&memchr((chset),input_string[i],strlen((chset)))){lex->mode=(lmode);if(keepch)--i;Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);str_clear(&tmptokstr);}
		/*****
		 * modeterminate
		 * - Finalize current token lexing and set state to LNONE
		 * - Copies tmptokstr into the newest token and stamps its line
		 *	keepch	bool	true steps i back so the terminating byte is
		 *			re-read in LNONE mode
		 *****/
#define modeterminate(keepch) do{lex->mode=LNONE;if(keepch)--i;str_assign(&(vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str),tmptokstr.buffer);vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)
		/*****
		 * modematch(chset,logic,keepch)
		 * - Match lexeme characters following initial character,
		 * - set token type and return lexer mode to normal
		 * - Terminates the token on NUL, or when (byte is in chset)==logic;
		 *   the current byte is appended to tmptokstr either way
		 *	chset	char*	set of characters which input_string[i] must match
		 *	logic	bool	if false, only modify current token when input_string[i] does NOT match chset
		 *	keepch	bool	will we retain this character in the token string?
		 * NOTE(review): the length 2 passed to str_append_n presumably means
		 * "one character plus terminator" -- confirm against str.h
		 *****/
#define modematch(chset,logic,keepch) do{if(!input_string[i]||(logic==(!!memchr((chset),input_string[i],strlen(chset)))) ){modeterminate(keepch);}str_append_n(&tmptokstr,input_string+i,2);}while(0)
		switch(lex->mode)
		{
			// New lexeme boundary
			// Starting condition
			/*****
			 * When we find a new lexeme,
			 * push a new Tok to the vector
			 * and set its type to the Lexer
			 * mode and initialize its str
			 *****/
			case LNONE:
				// Extra step: turn identifiers into keywords
				// In LNONE mode the newest token has already been finalized by
				// modeterminate, so it is the one to inspect. Looking one token
				// further back would never reach the final token of the input.
				if(lex->tokens.size>0)
				{
					Tok*lasttok=vec_at(&lex->tokens,lex->tokens.size-1,Tok*);
					if(lasttok->type==LIDENTIFIER)
					{
						for(size_t j=0;j<sizeof(keywords)/sizeof(char*);++j)
							if(strcmp(keywords[j],lasttok->str.buffer)==0)
								lasttok->type=LKEYWORD;
					}
				}
				/* !! FALL THROUGH !! */
			default:
				// We go past the input_string_len by one to
				// make sure the fixup stage (above)
				// always gets called
				if(i>=input_string_len)break;
				lex->escape_sequence=false;
				/* if(lex->mode==LSTRING&&input_string[i]=='\\'){Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);++i;} else */
				// The order matters: '-' is tested before operator_chars so a
				// minus sign enters LMINUS mode rather than LOPERATOR
				initmatch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_",LIDENTIFIER,true) else
				initmatch("\"",LSTRING,false) else
				initmatch("0123456789",LINTEGER,true) else
				initmatch("-",LMINUS,true) else
				initmatch(operator_chars,LOPERATOR,true) else
				initmatch("#",LCOMMENT,true) else
				// Whitespace produces no token; newlines advance the line counter
				if(strchr(" \t\n",input_string[i])){if(input_string[i]=='\n')++current_line;continue;} else
				// Casts make the arguments match %u and %x exactly
				err_log("%u: unrecognized character '%c' (%x)",(unsigned)current_line,((input_string[i]>32)?(input_string[i]):(' ')),(unsigned)(unsigned char)input_string[i]);
				//initmatch(" \t\n",LNONE,false)
				break;
			// Individual modes
			case LIDENTIFIER:modematch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789",false,true);break;
			case LFLOAT:modematch("0123456789",false,true);break;
			case LINTEGER:modematch("0123456789.",false,true);
				// A '.' inside an integer promotes the token to a float
				if(input_string[i]=='.')
				{
					vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LFLOAT;
					lex->mode=LFLOAT;
				}
				break;
			// Handle escape sequences in strings
			case LSTRING:if(!lex->escape_sequence)
				{
					// A backslash is dropped and arms the escape flag;
					// an unescaped '"' ends the string (and is not kept)
					if(input_string[i]=='\\')
						lex->escape_sequence=true;
					else
						modematch("\"",true,false);
				}
				else
				{
					// Overwrite the escaped letter in the caller's buffer with
					// the byte it stands for; unlisted letters stay unchanged
					switch(input_string[i])
					{
						case'n':input_string[i]='\n';break;
						case'r':input_string[i]='\r';break;
						case't':input_string[i]='\t';break;
						case'"':input_string[i]='"';break;
						/* case'\\':input_string[i]='\\';break; */
					}
					// chset starts at the byte itself, so it always matches and
					// (logic being false) the byte is appended without ending
					// the token -- only a NUL terminates here
					modematch(&input_string[i],false,false);
					lex->escape_sequence=false;
				}
				break;
			case LOPERATOR:modematch(operator_chars,false,true);
				// Single-character operators get a subtype and end the token
				switch(input_string[i])
				{
#define opmatch(lstype,keepch) do{vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=lstype;modeterminate(keepch);}while(0)
#define opmatch_nodup(lstype,keepch) do{lex->mode=LNONE;vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)
					case '+':opmatch(LADD,false);break;
					case ';':opmatch(LENDSTATEMENT,false);break;
					case '(':opmatch(LLPAREN,false);break;
					case ')':opmatch(LRPAREN,false);break;
					case '{':opmatch(LLCBRACE,false);break;
					case '}':opmatch(LRCBRACE,false);break;
					case '*':opmatch(LSMUL,false);break;
					case '/':opmatch(LSDIV,false);break;
					case '!':opmatch(LSNOT,false);break;
					case ':':opmatch(LSCOLON,false);break;
					case ',':opmatch(LSCOMMA,false);break;
					case '&':opmatch(LSREF,false);break;
					case '=':
						{
							opmatch(LASSIGN,false);
							// TODO: allow combination of certain chars with '='
							// to allow += or -= or *= etc.
							/* char*ss=vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer; */
							/* if(!ss)break; */
							/* printf("cmp('%s','%s')\n",vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer,ss); */
							/* if(strcmp("=",ss)==0)opmatch_nodup(0,false); */
							/* else if(strcmp("+",ss)==0)opmatch_nodup(LASSIGN,false); */
							/* else if(strcmp("-",ss)==0)opmatch_nodup(LASSIGN,false); */
							/* else if(strcmp("*",ss)==0)opmatch_nodup(LASSIGN,false); */
							/* else if(strcmp("/",ss)==0)opmatch_nodup(LASSIGN,false); */
						}
						break;
					default:break;
#undef opmatch
#undef opmatch_nodup
				}
				break;
			case LMINUS://modematch("0123456789=",true,true);
				// Either way the token is retyped as an operator
				if(input_string[i]=='-')
				{
					vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
					vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=LSMINUS;
					str_append_n(&tmptokstr,input_string+i,2);
					modeterminate(false);
				}
				else// if(input_string[i]=='=')
				{
					vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
					lex->mode=LOPERATOR;
					modeterminate(true);
				}
				break;
			// A comment runs up to the newline, which is then re-read in
			// LNONE mode so the line counter still advances
			case LCOMMENT:modematch("\n",true,true);break;
		}
#undef initmatch
#undef modematch
#undef modeterminate
	}
	str_free(&tmptokstr);
	//reg_free(&regex);
}
/*****
 * lex_print(l)
 * - Write a one-line dump of the lexer to stdout: its address,
 *   token count/capacity, then every token as 'text'(NAME type)
 * - NAME is the subtype name when the token has a non-zero subtype,
 *   otherwise the type name
 * - Does nothing when l is NULL
 *	l	Lexer*	lexer to dump
 *****/
void lex_print(Lexer*l)
{
	if(!l)return;
	// %p requires a void*, and %zu is the conversion for size_t;
	// the casts keep the arguments matched to the format exactly
	printf("%p: (%zu/%zu) [",(void*)l,(size_t)l->tokens.size,(size_t)l->tokens.capacity);
	for(size_t i=0;i<l->tokens.size;++i)
	{
		Tok*tok=vec_at(&l->tokens,i,Tok*);
		// Subtype values are offset by LENDSTATEMENT relative to the name table
		const char*name=((tok->subtype)?(lexsubtype_names[tok->subtype-LENDSTATEMENT]):(lextype_names[tok->type]));
		printf("'%s'(%s %u)",
			tok->str.buffer,
			name,
			(unsigned)tok->type
		);
		// Separator between tokens, but not after the last one
		if(i+1<l->tokens.size)
			printf(", ");
	}
	printf("]\n");
}
/* size_t lex_strchrcount(char*str,char c) */
/* { */
/* size_t count=0; */
/* for(size_t i=0;str[i];++i) */
/* if(str[i]==c) */
/* ++count; */
/* return count; */
/* } */