// Listing metadata: 445 lines, 12 KiB, C++
#ifdef IGNORE_THIS
|
|
|
|
/*
|
|
PARSER.C Lloyd Zusman, Master Byte Software, Trump User Group
|
|
(408) 395-5693 (voice only)
|
|
|
|
This program is a generalized, finite state token parser.
|
|
It is the most powerful parser I've seen on any BBS (if I do say
|
|
so myself). It allows you to extract tokens one at a time from a
|
|
string of characters. The characters used for white space, for
|
|
break characters, and for quotes can be specified. Also,
|
|
characters in the string can be preceded by a specifiable escape
|
|
character which removes any special meaning the character may have.
|
|
|
|
There are a lot of formal parameters in this subroutine call, but
|
|
once you get familiar with them, this routine is fairly easy to use.
|
|
"#define" macros can be used to generate simpler looking calls for
|
|
commonly used applications of this routine.
|
|
|
|
First, some terminology:
|
|
|
|
token used here, a single unit of information in
|
|
the form of a group of characters.
|
|
|
|
white space space that gets ignored (except within quotes
|
|
or when escaped), like blanks and tabs. in
|
|
addition, white space terminates a non-quoted
|
|
token.
|
|
|
|
break character a character that separates non-quoted tokens.
|
|
commas are a common break character. the
|
|
usage of break characters to signal the end
|
|
of a token is the same as that of white space,
|
|
except multiple break characters with nothing
|
|
or only white space between generate a null
|
|
token for each two break characters together.
|
|
|
|
for example, if blank is set to be the white
|
|
space and comma is set to be the break
|
|
character, the line ...
|
|
|
|
A, B, C , , DEF
|
|
|
|
... consists of 5 tokens:
|
|
|
|
1) "A"
|
|
2) "B"
|
|
3) "C"
|
|
4) "" (the null string)
|
|
5) "DEF"
|
|
|
|
quote character a character that, when surrounding a group
|
|
of other characters, causes the group of
|
|
characters to be treated as a single token,
|
|
no matter how many white spaces or break
|
|
characters exist in the group. also, a
|
|
token always terminates after the closing
|
|
quote. for example, if ' is the quote
|
|
character, blank is white space, and comma
|
|
is the break character, the following
|
|
string ...
|
|
|
|
A, ' B, CD'EF GHI
|
|
|
|
... consists of 4 tokens:
|
|
|
|
1) "A"
|
|
2) " B, CD" (note the blanks & comma)
|
|
3) "EF"
|
|
4) "GHI"
|
|
|
|
the quote characters themselves do
|
|
not appear in the resultant tokens. the
|
|
double quotes are delimiters i use here for
|
|
documentation purposes only.
|
|
|
|
escape character a character which itself is ignored but
|
|
which causes the next character to be
|
|
used as is. ^ and \ are often used as
|
|
escape characters. an escape in the last
|
|
position of the string gets treated as a
|
|
"normal" (i.e., non-quote, non-white,
|
|
non-break, and non-escape) character.
|
|
for example, assume white space, break
|
|
character, and quote are the same as in the
|
|
above examples, and further, assume that
|
|
^ is the escape character. then, in the
|
|
string ...
|
|
|
|
ABC, ' DEF ^' GH' I ^ J K^ L ^
|
|
|
|
... there are 7 tokens:
|
|
|
|
1) "ABC"
|
|
2) " DEF ' GH"
|
|
3) "I"
|
|
4) " " (a lone blank)
|
|
5) "J"
|
|
6) "K L"
|
|
7) "^" (passed as is at end of line)
|
|
|
|
|
|
OK, now that you have this background, here's how to call "parser":
|
|
|
|
result=parser(flag,token,maxtok,string,white,break,quote,escape,
|
|
brkused,next,quoted)
|
|
|
|
result: 0 if we haven't reached EOS (end of string), and
|
|
1 if we have (this is an "int").
|
|
|
|
flag: right now, only the values 0, 1 and 2 are recognized.
|
|
1 => convert non-quoted tokens to upper case
|
|
2 => convert non-quoted tokens to lower case
|
|
0 => do not convert non-quoted tokens
|
|
(this is an "int").
|
|
|
|
token: a character string containing the returned next token
|
|
(this is a "char[]").
|
|
|
|
maxtok: the maximum size of "token". characters beyond
|
|
"maxtok" are truncated (this is an "int").
|
|
|
|
string: the string to be parsed (this is a "char[]").
|
|
|
|
white: a string of the valid white spaces. example:
|
|
|
|
char whitesp[]={" \t"};
|
|
|
|
blank and tab will be valid white space (this is
|
|
a "char[]").
|
|
|
|
break: a string of the valid break characters. example:
|
|
|
|
char breakch[]={";,"};
|
|
|
|
semicolon and comma will be valid break characters
|
|
(this is a "char[]").
|
|
|
|
IMPORTANT: do not use the name "break" as a C
|
|
variable, as this is a reserved word in C.
|
|
|
|
quote: a string of the valid quote characters. an example
|
|
would be
|
|
|
|
char quotech[]={"'\""};
|
|
|
|
(this causes single and double quotes to be valid)
|
|
note that a token starting with one of these characters
|
|
needs the same quote character to terminate it.
|
|
|
|
for example,
|
|
|
|
"ABC '
|
|
|
|
is unterminated, but
|
|
|
|
"DEF" and 'GHI'
|
|
|
|
are properly terminated. note that different quote
|
|
characters can appear on the same line; only for
|
|
a given token do the quote characters have to be
|
|
the same (this is a "char[]").
|
|
|
|
escape: the escape character (NOT a string ... only one
|
|
allowed). use zero if none is desired (this is
|
|
a "char").
|
|
|
|
brkused: the break character used to terminate the current
|
|
token. if the token was quoted, this will be the
|
|
quote used. if the token is the last one on the
|
|
line, this will be zero (this is a pointer to a
|
|
"char").
|
|
|
|
next: this variable points to the first character of the
|
|
next token. it gets reset by "parser" as it steps
|
|
through the string. set it to 0 upon initialization,
|
|
and leave it alone after that. you can change it
|
|
if you want to jump around in the string or re-parse
|
|
from the beginning, but be careful (this is a
|
|
pointer to an "int").
|
|
|
|
quoted: set to 1 (true) if the token was quoted and 0 (false)
|
|
if not. you may need this information (for example:
|
|
in C, a string with quotes around it is a character
|
|
string, while one without is an identifier).
|
|
|
|
(this is a pointer to a "char").
|
|
|
|
Example 1:
|
|
|
|
char whitesp[]={" \t"}; // blank and tab
|
|
char breakch[]={",\r"}; // comma and carriage return
|
|
char quotech[]={"'\""}; // single and double quote
|
|
char escape='^'; // "uparrow" is escape
|
|
|
|
main()
|
|
{
|
|
char *fgets(),line[81],brkused,quoted,token[81];
|
|
int i,next;
|
|
|
|
while(fgets(line,80,stdin)!=NULL) // get line
|
|
{
|
|
|
|
printf("Line: %s",line); // already has <CR>
|
|
i=0;
|
|
|
|
next=0; // make sure you do this
|
|
|
|
while(parser(2,token,80,line,whitesp,breakch,quotech,escape,
|
|
&brkused,&next,&quoted)==0)
|
|
{
|
|
printf(" Token %d = (%s)\n",++i,token);
|
|
|
|
if(brkused=='\r') // <CR> is a break so it won't be included
|
|
break; // in the token. treat as end-of-line here
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
In the above example, lines are read from stdin and broken up into
|
|
tokens. All non-quoted tokens are converted to lower case. Since
|
|
fgets() returns the final carriage return, we treat it as a break
|
|
character to keep it out of the returned token. Also, since the only
|
|
way "parser" will return a non-zero error code is at end of line,
|
|
we test "brkused" to see if we've gotten to the final carriage
|
|
return, and we explicitly break out of the inner loop if we've
|
|
hit it. Note that since fgets() puts the final <CR> right before
|
|
the end-of-string, if we left out the "if(brkused=='\r')" test,
|
|
we'd get one extra null token (just as if the line ended with a
|
|
single comma). Run this example to see how it all works.
|
|
|
|
Example 2:
|
|
|
|
.
|
|
.
|
|
.
|
|
|
|
next=0;
|
|
result=parser(1, newstr, 80, str, "", "", "", 0, &brkused, &next,
|
|
&quoted);
|
|
.
|
|
.
|
|
.
|
|
|
|
this call takes whatever is in "str" and converts it to upper case,
|
|
putting the result in "newstr".
|
|
|
|
*** end of examples ***
|
|
|
|
in case you're interested, "parser.c" was inspired by a system
|
|
subroutine that comes as part of the PRIMOS operating system for
|
|
the Prime Computer: "gt$par.plp". i loosely patterned this routine
|
|
after the Prime routine.
|
|
|
|
Revisions:
|
|
|
|
09/30/84 Lloyd Zusman Initial coding
|
|
|
|
*/
|
|
|
|
#endif
|
|
|
|
#include "stdafx.h"
|
|
#include <ctype.h>
|
|
#include "parser.h"
|
|
|
|
|
|
////////////////////////////////////////////////////////////////
// Locate the first occurrence of a character in a string.
//
//   ch      character to look for
//   string  NUL-terminated string to search
//
// Returns the zero-based offset of the first match, or -1 when the
// character does not occur.  The terminating NUL itself is never
// matched, so searching for '\0' always yields -1.
int sindex(char ch,char *string)
{
    int offset = 0;

    while (string[offset] != '\0')
    {
        if (string[offset] == ch)
            return offset;          // found: report its position
        ++offset;
    }

    return -1;                      // hit end of string without a match
}
|
|
|
|
///////////////////////////////////////////////////////////////
|
|
// routine to store a character in a string
|
|
void chstore(char *string,int max,char ch, int *pos, int state,
|
|
int flag)
|
|
{
|
|
char c;
|
|
if(*pos >= 0 && *pos < max-1)
|
|
{
|
|
if(state==IN_QUOTE)
|
|
c=ch;
|
|
else
|
|
switch(flag)
|
|
{
|
|
case 1: /* convert to upper */
|
|
c=toupper(ch);
|
|
break;
|
|
|
|
case 2: /* convert to lower */
|
|
c=tolower(ch);
|
|
break;
|
|
|
|
default: /* use as is */
|
|
c=ch;
|
|
break;
|
|
}
|
|
string[(*pos)++]=c;
|
|
}
|
|
return;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////
|
|
// parse a given string for a token
|
|
int parser(int inflag,char *token,int tokmax,char *line,char *white,
|
|
char *brkchar,char *quote,char eschar,char *brkused,int *next,
|
|
char *quoted)
|
|
{
|
|
int _p_tokpos; // current token pos
|
|
int _p_state; // current state
|
|
int _p_flag; // option flag
|
|
char _p_curquote; // current quote char
|
|
int qp;
|
|
char c,nc;
|
|
|
|
*brkused=0; /* initialize to null */
|
|
*quoted=0; /* assume not quoted */
|
|
|
|
if(!line[*next]) /* if we're at end of line, indicate such */
|
|
return 1;
|
|
|
|
_p_state=IN_WHITE; /* initialize state */
|
|
_p_curquote=0; /* initialize previous quote char */
|
|
_p_flag=inflag; /* set option flag */
|
|
|
|
for(_p_tokpos=0;c=line[*next];++(*next)) /* main loop */
|
|
{
|
|
if((qp=sindex(c,brkchar))>=0) /* break */
|
|
{
|
|
switch(_p_state)
|
|
{
|
|
case IN_WHITE: /* these are the same here ... */
|
|
case IN_TOKEN: /* ... just get out */
|
|
case IN_OZONE: /* ditto */
|
|
++(*next);
|
|
*brkused=brkchar[qp];
|
|
goto byebye;
|
|
|
|
case IN_QUOTE: /* just keep going */
|
|
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
|
break;
|
|
}
|
|
}
|
|
else if((qp=sindex(c,quote))>=0) /* quote */
|
|
{
|
|
switch(_p_state)
|
|
{
|
|
case IN_WHITE: /* these are identical, */
|
|
_p_state=IN_QUOTE; /* change states */
|
|
_p_curquote=quote[qp]; /* save quote char */
|
|
*quoted=1; /* set to true as long as something is in quotes */
|
|
break;
|
|
|
|
case IN_QUOTE:
|
|
if(quote[qp]==_p_curquote) /* same as the beginning quote? */
|
|
{
|
|
_p_state=IN_OZONE;
|
|
_p_curquote=0;
|
|
}
|
|
else
|
|
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag); /* treat as regular char */
|
|
break;
|
|
|
|
case IN_TOKEN:
|
|
case IN_OZONE:
|
|
*brkused=c; /* uses quote as break char */
|
|
goto byebye;
|
|
}
|
|
}
|
|
else if((qp=sindex(c,white))>=0) /* white */
|
|
{
|
|
switch(_p_state)
|
|
{
|
|
case IN_WHITE:
|
|
case IN_OZONE:
|
|
break; /* keep going */
|
|
|
|
case IN_TOKEN:
|
|
_p_state=IN_OZONE;
|
|
break;
|
|
|
|
case IN_QUOTE:
|
|
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag); /* it's valid here */
|
|
break;
|
|
}
|
|
}
|
|
else if(c==eschar) /* escape */
|
|
{
|
|
nc=line[(*next)+1];
|
|
if(nc==0) /* end of line */
|
|
{
|
|
*brkused=0;
|
|
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
|
++(*next);
|
|
goto byebye;
|
|
}
|
|
switch(_p_state)
|
|
{
|
|
case IN_WHITE:
|
|
--(*next);
|
|
_p_state=IN_TOKEN;
|
|
break;
|
|
|
|
case IN_TOKEN:
|
|
case IN_QUOTE:
|
|
++(*next);
|
|
chstore(token,tokmax,nc,&_p_tokpos,_p_state,_p_flag);
|
|
break;
|
|
|
|
case IN_OZONE:
|
|
goto byebye;
|
|
}
|
|
}
|
|
else /* anything else is just a real character */
|
|
{
|
|
switch(_p_state)
|
|
{
|
|
case IN_WHITE:
|
|
_p_state=IN_TOKEN; /* switch states */
|
|
|
|
case IN_TOKEN: /* these 2 are */
|
|
case IN_QUOTE: /* identical here */
|
|
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
|
break;
|
|
|
|
case IN_OZONE:
|
|
goto byebye;
|
|
}
|
|
}
|
|
} /* end of main loop */
|
|
|
|
byebye:
|
|
token[_p_tokpos]=0; /* make sure token ends with EOS */
|
|
|
|
return 0;
|
|
|
|
}
|