249 lines
8.6 KiB
C
249 lines
8.6 KiB
C
#include<stdbool.h>
|
|
#include<stdio.h>
|
|
#include<stdlib.h>
|
|
#include<string.h>
|
|
#include<unistd.h>
|
|
#include<ctype.h>
|
|
#include"lex.h"
|
|
#include"str.h"
|
|
#include"reg.h"
|
|
#include"err.h"
|
|
#include"state.h"
|
|
|
|
/* Printable names of the token types, NULL-terminated.
 * lex_print() indexes this table with a token's type value. */
const char *lextype_names[] = {
	"LNONE",
	"LIDENTIFIER",
	"LINTEGER",
	"LFLOAT",
	"LSTRING",
	"LOPERATOR",
	"LKEYWORD",
	"LCOMMENT",
	"LMINUS",
	"LFAKE",
	NULL,
};

/* ANSI colour escape sequences.
 * NOTE(review): presumably parallel to the first eight entries of
 * lextype_names (there is no entry for LMINUS or LFAKE) -- confirm
 * against the code that indexes this table. */
const char *lextype_colors[] = {
	"\033[0m",
	"\033[0m",
	"\033[36m",
	"\033[35m",
	"\033[32m",
	"\033[0m",
	"\033[33m",
	"\033[34m",
};

/* Printable names of the operator subtypes, NULL-terminated.
 * lex_print() indexes this table with (subtype - LENDSTATEMENT). */
const char *lexsubtype_names[] = {
	"LENDSTATEMENT",
	"LASSIGN",
	"LLPAREN",
	"LRPAREN",
	"LLCBRACE",
	"LRCBRACE",
	"LSMINUS",
	"LADD",
	"LSMUL",
	"LSDIV",
	"LSNOT",
	"LSCOLON",
	"LSCOMMA",
	NULL,
};

/* Every character that may start or continue an operator token. */
static const char *operator_chars = "-+*/=;(),.{}<>\\!:";

/* Identifiers whose text equals one of these are retyped as LKEYWORD. */
static const char *keywords[] = {
	"do",
	"false",
	"fn",
	"for",
	"if",
	"let",
	"ret",
	"true",
	"while",
	"call",
	"asm",
	"ext",
};
|
|
/* static char*operators[]={";","=","+=","-=","*=","/=","+","-","/","*","(",")","{","}"}; */
|
|
|
|
Lexer lex_new(void)
|
|
{
|
|
Lexer l={
|
|
.mode=LNONE,
|
|
.tokens=vec_new(sizeof(Tok)),
|
|
.escape_sequence=false,
|
|
};
|
|
|
|
return l;
|
|
}
|
|
|
|
/*
 * lex_free - release everything a lexer owns.
 *
 * Frees the string of every stored token, then the token vector itself.
 * A NULL lexer is accepted and ignored.
 */
void lex_free(Lexer *l)
{
	size_t idx = 0;

	if (l == NULL) {
		return;
	}

	/* Each token owns its own Str, so those go first. */
	while (idx < l->tokens.size) {
		Tok *tok = vec_at(&l->tokens, idx, Tok*);
		str_free(&tok->str);
		++idx;
	}

	vec_free(&l->tokens);
}
|
|
|
|
// Read string and store tokens
|
|
void lex_string(Lexer*lex,char*input_string)
|
|
{
|
|
//Reg regex=reg_new();
|
|
Str tmptokstr=str_new();
|
|
size_t current_line=1;
|
|
size_t input_string_len;
|
|
|
|
if(!lex||!input_string)return;
|
|
|
|
input_string_len=strlen(input_string);
|
|
|
|
// Read each individual byte
|
|
for(size_t i=0;i<input_string_len+1;++i)
|
|
{
|
|
|
|
/*****
|
|
* initmatch(chset,mode,keepch)
|
|
* - Match initial character and change lexer to corresponding mode,
|
|
* - clear tmptokstr and initialize new token
|
|
* chset char* set of characters which input_string[i] must match
|
|
* mode uint32_t change lexer mode to this
|
|
* keepch bool will we retain this character in the token string?
|
|
*****/
|
|
#define initmatch(chset,lmode,keepch) if(input_string[i]&&memchr((chset),input_string[i],strlen((chset)))){lex->mode=(lmode);if(keepch)--i;Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);str_clear(&tmptokstr);}
|
|
|
|
/*****
|
|
* modeterminate
|
|
* - Finalize current token lexing and set state to LNONE
|
|
* keepch bool will we retain this character in the token string?
|
|
*****/
|
|
#define modeterminate(keepch) do{lex->mode=LNONE;if(keepch)--i;str_assign(&(vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str),tmptokstr.buffer);vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)
|
|
|
|
/*****
|
|
* modematch(chset,logic,keepch)
|
|
* - Match lexeme characters following initial character,
|
|
* - set token type and return lexer mode to normal
|
|
* chset char* set of characters which input_string[i] must match
|
|
* logic bool if false, only modify current token when input_string[i] does NOT match chset
|
|
* keepch bool will we retain this character in the token string?
|
|
*****/
|
|
#define modematch(chset,logic,keepch) do{if(!input_string[i]||(logic==(!!memchr((chset),input_string[i],strlen(chset)))) ){modeterminate(keepch);}str_append_n(&tmptokstr,input_string+i,2);}while(0)
|
|
|
|
switch(lex->mode)
|
|
{
|
|
|
|
// New lexeme boundary
|
|
// Starting condition
|
|
|
|
/*****
|
|
* When we find a new lexeme,
|
|
* push a new Tok to the vector
|
|
* and set its type to the Lexer
|
|
* mode and initialize its str
|
|
*****/
|
|
|
|
case LNONE:
|
|
// Extra step: turn identifiers into keywords
|
|
if(lex->tokens.size>1)
|
|
{
|
|
Tok*lasttok=vec_at(&lex->tokens,lex->tokens.size-2,Tok*);
|
|
if(lasttok->type==LIDENTIFIER)
|
|
{
|
|
for(size_t j=0;j<sizeof(keywords)/sizeof(char*);++j)
|
|
if(strcmp(keywords[j],lasttok->str.buffer)==0)
|
|
lasttok->type=LKEYWORD;
|
|
}
|
|
}
|
|
/* !! FALL THROUGH !! */
|
|
default:
|
|
|
|
// We go past the input_string_len by one to
|
|
// make sure the fixup stage (above)
|
|
// always gets called
|
|
if(i>=input_string_len)break;
|
|
lex->escape_sequence=false;
|
|
|
|
/* if(lex->mode==LSTRING&&input_string[i]=='\\'){Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);++i;} else */
|
|
initmatch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_",LIDENTIFIER,true) else
|
|
initmatch("\"",LSTRING,false) else
|
|
initmatch("0123456789",LINTEGER,true) else
|
|
initmatch("-",LMINUS,true) else
|
|
initmatch(operator_chars,LOPERATOR,true) else
|
|
initmatch("#",LCOMMENT,true) else
|
|
if(strchr(" \t\n",input_string[i])){if(input_string[i]=='\n')++current_line;continue;} else
|
|
err_log("%u: unrecognized character '%c' (%x)",current_line,((input_string[i]>32)?(input_string[i]):(' ')),input_string[i]);
|
|
//initmatch(" \t\n",LNONE,false)
|
|
break;
|
|
|
|
// Individual modes
|
|
case LIDENTIFIER:modematch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789",false,true);break;
|
|
case LFLOAT:modematch("0123456789",false,true);break;
|
|
case LINTEGER:modematch("0123456789.",false,true);
|
|
if(input_string[i]=='.')
|
|
{
|
|
vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LFLOAT;
|
|
lex->mode=LFLOAT;
|
|
}
|
|
break;
|
|
// Handle escape sequences in strings
|
|
case LSTRING:if(!lex->escape_sequence)
|
|
{
|
|
if(input_string[i]=='\\')
|
|
lex->escape_sequence=true;
|
|
else
|
|
modematch("\"",true,false);
|
|
}
|
|
else
|
|
{
|
|
switch(input_string[i])
|
|
{
|
|
case'n':input_string[i]='\n';break;
|
|
case'r':input_string[i]='\r';break;
|
|
case't':input_string[i]='\t';break;
|
|
case'"':input_string[i]='"';break;
|
|
/* case'\\':input_string[i]='\\';break; */
|
|
}
|
|
modematch(&input_string[i],false,false);
|
|
lex->escape_sequence=false;
|
|
}
|
|
break;
|
|
case LOPERATOR:modematch(operator_chars,false,true);
|
|
switch(input_string[i])
|
|
{
|
|
#define opmatch(lstype,keepch) do{vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=lstype;modeterminate(keepch);}while(0)
|
|
#define opmatch_nodup(lstype,keepch) do{lex->mode=LNONE;vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)
|
|
case '+':opmatch(LADD,false);break;
|
|
case ';':opmatch(LENDSTATEMENT,false);break;
|
|
case '(':opmatch(LLPAREN,false);break;
|
|
case ')':opmatch(LRPAREN,false);break;
|
|
case '{':opmatch(LLCBRACE,false);break;
|
|
case '}':opmatch(LRCBRACE,false);break;
|
|
case '*':opmatch(LSMUL,false);break;
|
|
case '/':opmatch(LSDIV,false);break;
|
|
case '!':opmatch(LSNOT,false);break;
|
|
case ':':opmatch(LSCOLON,false);break;
|
|
case ',':opmatch(LSCOMMA,false);break;
|
|
case '=':
|
|
{
|
|
opmatch(LASSIGN,false);
|
|
// TODO: allow combination of certain chars with '='
|
|
// to allow += or -= or *= etc.
|
|
|
|
/* char*ss=vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer; */
|
|
/* if(!ss)break; */
|
|
/* printf("cmp('%s','%s')\n",vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer,ss); */
|
|
/* if(strcmp("=",ss)==0)opmatch_nodup(0,false); */
|
|
/* else if(strcmp("+",ss)==0)opmatch_nodup(LASSIGN,false); */
|
|
/* else if(strcmp("-",ss)==0)opmatch_nodup(LASSIGN,false); */
|
|
/* else if(strcmp("*",ss)==0)opmatch_nodup(LASSIGN,false); */
|
|
/* else if(strcmp("/",ss)==0)opmatch_nodup(LASSIGN,false); */
|
|
}
|
|
break;
|
|
default:break;
|
|
#undef opmatch
|
|
#undef opmatch_nodup
|
|
}
|
|
break;
|
|
case LMINUS://modematch("0123456789=",true,true);
|
|
if(input_string[i]=='-')
|
|
{
|
|
vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
|
|
vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=LSMINUS;
|
|
str_append_n(&tmptokstr,input_string+i,2);
|
|
modeterminate(false);
|
|
}
|
|
else// if(input_string[i]=='=')
|
|
{
|
|
vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
|
|
lex->mode=LOPERATOR;
|
|
modeterminate(true);
|
|
}
|
|
break;
|
|
case LCOMMENT:modematch("\n",true,true);break;
|
|
|
|
}
|
|
#undef initmatch
|
|
#undef modematch
|
|
#undef modeterminate
|
|
}
|
|
|
|
str_free(&tmptokstr);
|
|
//reg_free(®ex);
|
|
}
|
|
|
|
/*
 * lex_print - write a one-line dump of a lexer's tokens to stdout.
 *
 * Output format:
 *   <address>: (<size>/<capacity>) ['text'(NAME type), 'text'(NAME type), ...]
 * NAME is the subtype name when the token's subtype is non-zero, otherwise
 * the type name; `type` is the numeric type value.
 *
 * l  lexer to dump; NULL is accepted and ignored, matching lex_free() and
 *    lex_string().
 */
void lex_print(Lexer*l)
{
	if(!l)return;

	// %p requires a void*, and %lu an unsigned long, so cast explicitly
	printf("%p: (%lu/%lu) [",
		(void*)l,
		(unsigned long)l->tokens.size,
		(unsigned long)l->tokens.capacity);

	for(size_t i=0;i<l->tokens.size;++i)
	{
		Tok*tok=vec_at(&l->tokens,i,Tok*);
		printf("'%s'(%s %u)",
			// Passing NULL for %s is undefined; print an empty text instead
			((tok->str.buffer)?(tok->str.buffer):("")),
			((tok->subtype)?(lexsubtype_names[tok->subtype-LENDSTATEMENT]):(lextype_names[tok->type])),
			(unsigned)tok->type
		);
		// Separator between tokens, but not after the last one
		if(i+1<l->tokens.size)
			printf(", ");
	}
	printf("]\n");
}
|
|
|
|
/* size_t lex_strchrcount(char*str,char c) */
|
|
/* { */
|
|
/* size_t count=0; */
|
|
/* for(size_t i=0;str[i];++i) */
|
|
/* if(str[i]==c) */
|
|
/* ++count; */
|
|
/* return count; */
|
|
/* } */
|