/* lr/lex.c */

#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<ctype.h>
#include"lex.h"
#include"str.h"
#include"reg.h"
#include"err.h"
#include"state.h"

/* Printable name of each token type; lex_print indexes this by Tok.type.
 * The list ends with a NULL sentinel. */
const char*lextype_names[]=
{
	"LNONE",
	"LIDENTIFIER",
	"LINTEGER",
	"LFLOAT",
	"LSTRING",
	"LOPERATOR",
	"LKEYWORD",
	"LCOMMENT",
	"LMINUS",
	"LFAKE",
	NULL
};

/* Terminal escape sequences, one per entry.
 * NOTE(review): presumably indexed by token type like lextype_names, but
 * it holds only 8 entries (none for LMINUS/LFAKE) and has no NULL
 * sentinel -- confirm callers never index it with those types. */
const char*lextype_colors[]=
{
	"\033[0m",
	"\033[0m",
	"\033[36m",
	"\033[35m",
	"\033[32m",
	"\033[0m",
	"\033[33m",
	"\033[34m"
};

/* Printable name of each operator subtype; lex_print indexes this by
 * (Tok.subtype - LENDSTATEMENT). The list ends with a NULL sentinel. */
const char*lexsubtype_names[]=
{
	"LENDSTATEMENT",
	"LASSIGN",
	"LLPAREN",
	"LRPAREN",
	"LLCBRACE",
	"LRCBRACE",
	"LSMINUS",
	"LADD",
	"LSMUL",
	"LSDIV",
	"LSNOT",
	"LSCOLON",
	"LSCOMMA",
	NULL
};

/* Characters that may begin (and continue) an operator token. */
static const char*operator_chars="-+*/=;(),.{}<>\\!:";

/* Identifiers whose text equals one of these are retyped as LKEYWORD
 * by lex_string. */
static const char*keywords[]=
{
	"do",
	"false",
	"fn",
	"for",
	"if",
	"let",
	"ret",
	"true",
	"while",
	"call",
	"asm",
	"ext"
};
/* static char*operators[]={";","=","+=","-=","*=","/=","+","-","/","*","(",")","{","}"}; */

Lexer lex_new(void)
{
	Lexer l={
		.mode=LNONE,
		.tokens=vec_new(sizeof(Tok)),
		.escape_sequence=false,
	};

	return l;
}

/*****
 * lex_free(l)
 * - Release the string held by every token, then the token vector.
 * - The Lexer object itself is not released here.
 * l            Lexer*    lexer to tear down; NULL is ignored
 *****/
void lex_free(Lexer*l)
{
	if(l)
	{
		size_t idx=0;

		// Each token carries its own Str, freed one by one
		while(idx<l->tokens.size)
		{
			Tok*tok=vec_at(&l->tokens,idx,Tok*);
			str_free(&tok->str);
			++idx;
		}

		vec_free(&l->tokens);
	}
}

/*****
 * lex_string(lex,input_string)
 * - Read input_string one byte at a time and push the tokens found
 * - onto lex->tokens; lex->mode tracks the kind of token in progress.
 * lex           Lexer*    lexer to update; NULL makes the call a no-op
 * input_string  char*     NUL-terminated text; NULL makes the call a no-op
 *
 * NOTE: the buffer must be writable -- inside string literals the
 * character following a backslash is overwritten in place with the
 * byte it denotes (n, r, t).
 *****/
void lex_string(Lexer*lex,char*input_string)
{
	// Validate arguments before acquiring tmptokstr so the early
	// return cannot skip its str_free
	if(!lex||!input_string)return;

	//Reg regex=reg_new();
	Str tmptokstr=str_new();
	size_t current_line=1;
	size_t input_string_len=strlen(input_string);

	// Read each individual byte
	// (the index runs one past the last character, onto the NUL, so a
	// token still open at end of input gets terminated)
	for(size_t i=0;i<input_string_len+1;++i)
	{

		/*****
		 * initmatch(chset,mode,keepch)
		 * - Match initial character and change lexer to corresponding mode,
		 * - clear tmptokstr and initialize new token
		 * chset        char*     set of characters which input_string[i] must match
		 * mode         uint32_t  change lexer mode to this
		 * keepch       bool      will we retain this character in the token string?
		 *                        (true steps i back so the new mode re-reads it)
		 *****/
#define initmatch(chset,lmode,keepch) if(input_string[i]&&memchr((chset),input_string[i],strlen((chset)))){lex->mode=(lmode);if(keepch)--i;Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);str_clear(&tmptokstr);}

		/*****
		 * modeterminate
		 * - Finalize current token lexing and set state to LNONE
		 * keepch       bool      will we retain this character in the token string?
		 *                        (true steps i back so LNONE re-reads it)
		 *****/
#define modeterminate(keepch) do{lex->mode=LNONE;if(keepch)--i;str_assign(&(vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str),tmptokstr.buffer);vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)

		/*****
		 * modematch(chset,logic,keepch)
		 * - Match lexeme characters following initial character,
		 * - set token type and return lexer mode to normal
		 * chset        char*     set of characters which input_string[i] must match
		 * logic        bool      if false, only modify current token when input_string[i] does NOT match chset
		 * keepch       bool      will we retain this character in the token string?
		 *****/
#define modematch(chset,logic,keepch) do{if(!input_string[i]||(logic==(!!memchr((chset),input_string[i],strlen(chset)))) ){modeterminate(keepch);}str_append_n(&tmptokstr,input_string+i,2);}while(0)

		switch(lex->mode)
		{

			// New lexeme boundary
			// Starting condition

			/*****
			 * When we find a new lexeme,
			 * push a new Tok to the vector
			 * and set its type to the Lexer
			 * mode and initialize its str
			 *****/

			case LNONE:
				// Extra step: turn identifiers into keywords.
				// In LNONE no token is in progress, so the most recently
				// pushed token is complete and can be checked directly.
				// (Checking the token before it lagged one token behind
				// and never reached the final token of the input.)
				if(lex->tokens.size>0)
				{
					Tok*lasttok=vec_at(&lex->tokens,lex->tokens.size-1,Tok*);
					if(lasttok->type==LIDENTIFIER)
					{
						for(size_t j=0;j<sizeof(keywords)/sizeof(char*);++j)
							if(strcmp(keywords[j],lasttok->str.buffer)==0)
							{
								lasttok->type=LKEYWORD;
								break;
							}
					}
				}
				/* !! FALL THROUGH !! */
			default:

				// We go past the input_string_len by one to
				// make sure the fixup stage (above)
				// always gets called
				if(i>=input_string_len)break;
				lex->escape_sequence=false;

				/* if(lex->mode==LSTRING&&input_string[i]=='\\'){Tok _tmptok={.str=str_new(),.type=lex->mode,.line=current_line};vec_push(&lex->tokens,&_tmptok);++i;} else */
				// First matching class wins; '-' is tried before the
				// general operator set so it gets its own LMINUS mode
				initmatch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_",LIDENTIFIER,true) else
				initmatch("\"",LSTRING,false) else
				initmatch("0123456789",LINTEGER,true) else
				initmatch("-",LMINUS,true) else
				initmatch(operator_chars,LOPERATOR,true) else
				initmatch("#",LCOMMENT,true) else
				if(strchr(" \t\n",input_string[i])){if(input_string[i]=='\n')++current_line;continue;} else
				// Cast so the argument type matches the %u conversion
				// (assuming err_log takes printf-style varargs)
				err_log("%u: unrecognized character '%c' (%x)",(unsigned)current_line,((input_string[i]>32)?(input_string[i]):(' ')),input_string[i]);
				//initmatch(" \t\n",LNONE,false)
				break;

			// Individual modes
			case LIDENTIFIER:modematch("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789",false,true);break;
			case LFLOAT:modematch("0123456789",false,true);break;
			case LINTEGER:modematch("0123456789.",false,true);
						 // A '.' promotes the integer in progress to a float
						 if(input_string[i]=='.')
						 {
							 vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LFLOAT;
							 lex->mode=LFLOAT;
						 }
						 break;
			// Handle escape sequences in strings
			case LSTRING:if(!lex->escape_sequence)
						 {
							 // The backslash itself is not added to the token;
							 // it only flags the next byte as escaped
							 if(input_string[i]=='\\')
								 lex->escape_sequence=true;
							 else
								 modematch("\"",true,false);
						 }
						 else
						 {
							 // Overwrite the escape letter in the input buffer
							 // with the byte it stands for
							 switch(input_string[i])
							 {
								 case'n':input_string[i]='\n';break;
								 case'r':input_string[i]='\r';break;
								 case't':input_string[i]='\t';break;
								 case'"':input_string[i]='"';break;
								 /* case'\\':input_string[i]='\\';break; */
							 }
							 // chset starts at the current byte, so any non-NUL
							 // byte matches itself; with logic=false the token
							 // is therefore only terminated on NUL
							 modematch(&input_string[i],false,false);
							 lex->escape_sequence=false;
						 }
						 break;
			case LOPERATOR:modematch(operator_chars,false,true);
						   switch(input_string[i])
						   {
#define opmatch(lstype,keepch) do{vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=lstype;modeterminate(keepch);}while(0)
#define opmatch_nodup(lstype,keepch) do{lex->mode=LNONE;vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->line=current_line;str_append_n(&tmptokstr,input_string+i,2);}while(0)
							   case '+':opmatch(LADD,false);break;
							   case ';':opmatch(LENDSTATEMENT,false);break;
							   case '(':opmatch(LLPAREN,false);break;
							   case ')':opmatch(LRPAREN,false);break;
							   case '{':opmatch(LLCBRACE,false);break;
							   case '}':opmatch(LRCBRACE,false);break;
							   case '*':opmatch(LSMUL,false);break;
							   case '/':opmatch(LSDIV,false);break;
							   case '!':opmatch(LSNOT,false);break;
							   case ':':opmatch(LSCOLON,false);break;
							   case ',':opmatch(LSCOMMA,false);break;
							   case '=':
										{
											opmatch(LASSIGN,false);
											// TODO: allow combination of certain chars with '='
											// to allow += or -= or *= etc.

											/* char*ss=vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer; */
											/* if(!ss)break; */
											/* printf("cmp('%s','%s')\n",vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->str.buffer,ss); */
											/* if(strcmp("=",ss)==0)opmatch_nodup(0,false); */
											/* else if(strcmp("+",ss)==0)opmatch_nodup(LASSIGN,false); */
											/* else if(strcmp("-",ss)==0)opmatch_nodup(LASSIGN,false); */
											/* else if(strcmp("*",ss)==0)opmatch_nodup(LASSIGN,false); */
											/* else if(strcmp("/",ss)==0)opmatch_nodup(LASSIGN,false); */
										}
										break;
							   default:break;
#undef opmatch
#undef opmatch_nodup
						   }
						   break;
			case LMINUS://modematch("0123456789=",true,true);
						// The '-' that opened this mode is re-read here and
						// turned into an LOPERATOR token with subtype LSMINUS
						if(input_string[i]=='-')
						{
							vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
							vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->subtype=LSMINUS;
							str_append_n(&tmptokstr,input_string+i,2);
							modeterminate(false);
						}
						else// if(input_string[i]=='=')
						{
							vec_at(&lex->tokens,lex->tokens.size-1,Tok*)->type=LOPERATOR;
							lex->mode=LOPERATOR;
							modeterminate(true);
						}
						break;
			// A comment runs to the newline, which is handed back to
			// LNONE so the line counter still sees it
			case LCOMMENT:modematch("\n",true,true);break;

		}
#undef initmatch
#undef modematch
#undef modeterminate
	}

	str_free(&tmptokstr);
	//reg_free(&regex);
}

/*****
 * lex_print(l)
 * - Write a one-line dump of the lexer to stdout: its address,
 * - token count/capacity, then each token as 'text'(NAME type),
 * - where NAME is the subtype name when the subtype is non-zero
 * - and the type name otherwise
 * l            Lexer*    lexer to dump; NULL is ignored
 *****/
void lex_print(Lexer*l)
{
	// Same NULL policy as lex_free and lex_string
	if(!l)return;

	// %p requires a void*, and %zu is the conversion for size_t;
	// the casts keep the arguments matched to the format
	printf("%p: (%zu/%zu) [",(void*)l,(size_t)l->tokens.size,(size_t)l->tokens.capacity);
	for(size_t i=0;i<l->tokens.size;++i)
	{
		Tok*tok=vec_at(&l->tokens,i,Tok*);
		printf("'%s'(%s %u)",
				tok->str.buffer,
				((tok->subtype)?(lexsubtype_names[tok->subtype-LENDSTATEMENT]):(lextype_names[tok->type])),
				(unsigned)tok->type
				);
		// Separator between tokens, none after the last
		if(i<l->tokens.size-1)
			printf(", ");
	}
	printf("]\n");
}

/* size_t lex_strchrcount(char*str,char c) */
/* { */
/* 	size_t count=0; */
/* 	for(size_t i=0;str[i];++i) */
/* 		if(str[i]==c) */
/* 			++count; */
/* 	return count; */
/* } */