CB32 1.2 sources
This commit is contained in:
parent
cf25bc7627
commit
fd87f89a19
100 changed files with 14543 additions and 0 deletions
445
Parser.cpp
Normal file
445
Parser.cpp
Normal file
|
|
@ -0,0 +1,445 @@
|
|||
#ifdef IGNORE_THIS
|
||||
|
||||
/*
|
||||
PARSER.C Lloyd Zusman, Master Byte Software, Trump User Group
|
||||
(408) 395-5693 (voice only)
|
||||
|
||||
This program is a generalized, finite state token parser. In fact,
|
||||
it is the most powerful parser I've seen on any BBS (if I do say
|
||||
so myself). It allows you to extract tokens one at a time from a
|
||||
string of characters. The characters used for white space, for
|
||||
break characters, and for quotes can be specified. Also,
|
||||
characters in the string can be preceded by a specifiable escape
|
||||
character which removes any special meaning the character may have.
|
||||
|
||||
There are a lot of formal parameters in this subroutine call, but
|
||||
once you get familiar with them, this routine is fairly easy to use.
|
||||
"#define" macros can be used to generate simpler looking calls for
|
||||
commonly used applications of this routine.
|
||||
|
||||
First, some terminology:
|
||||
|
||||
token used here, a single unit of information in
|
||||
the form of a group of characters.
|
||||
|
||||
white space space that gets ignored (except within quotes
|
||||
or when escaped), like blanks and tabs. in
|
||||
addition, white space terminates a non-quoted
|
||||
token.
|
||||
|
||||
break character a character that separates non-quoted tokens.
|
||||
commas are a common break character. the
|
||||
usage of break characters to signal the end
|
||||
of a token is the same as that of white space,
|
||||
except multiple break characters with nothing
|
||||
or only white space between generate a null
|
||||
token for each two break characters together.
|
||||
|
||||
for example, if blank is set to be the white
|
||||
space and comma is set to be the break
|
||||
character, the line ...
|
||||
|
||||
A, B, C , , DEF
|
||||
|
||||
... consists of 5 tokens:
|
||||
|
||||
1) "A"
|
||||
2) "B"
|
||||
3) "C"
|
||||
4) "" (the null string)
|
||||
5) "DEF"
|
||||
|
||||
quote character a character that, when surrounding a group
|
||||
of other characters, causes the group of
|
||||
characters to be treated as a single token,
|
||||
no matter how many white spaces or break
|
||||
characters exist in the group. also, a
|
||||
token always terminates after the closing
|
||||
quote. for example, if ' is the quote
|
||||
character, blank is white space, and comma
|
||||
is the break character, the following
|
||||
string ...
|
||||
|
||||
A, ' B, CD'EF GHI
|
||||
|
||||
... consists of 4 tokens:
|
||||
|
||||
1) "A"
|
||||
2) " B, CD" (note the blanks & comma)
|
||||
3) "EF"
|
||||
4) "GHI"
|
||||
|
||||
the quote characters themselves do
|
||||
not appear in the resultant tokens. the
|
||||
double quotes are delimiters i use here for
|
||||
documentation purposes only.
|
||||
|
||||
escape character a character which itself is ignored but
|
||||
which causes the next character to be
|
||||
used as is. ^ and \ are often used as
|
||||
escape characters. an escape in the last
|
||||
position of the string gets treated as a
|
||||
"normal" (i.e., non-quote, non-white,
|
||||
non-break, and non-escape) character.
|
||||
for example, assume white space, break
|
||||
character, and quote are the same as in the
|
||||
above examples, and further, assume that
|
||||
^ is the escape character. then, in the
|
||||
string ...
|
||||
|
||||
ABC, ' DEF ^' GH' I ^ J K^ L ^
|
||||
|
||||
... there are 7 tokens:
|
||||
|
||||
1) "ABC"
|
||||
2) " DEF ' GH"
|
||||
3) "I"
|
||||
4) " " (a lone blank)
|
||||
5) "J"
|
||||
6) "K L"
|
||||
7) "^" (passed as is at end of line)
|
||||
|
||||
|
||||
OK, now that you have this background, here's how to call "parser":
|
||||
|
||||
result=parser(flag,token,maxtok,string,white,break,quote,escape,
|
||||
brkused,next,quoted)
|
||||
|
||||
result: 0 if we haven't reached EOS (end of string), and
|
||||
1 if we have (this is an "int").
|
||||
|
||||
flag: right now, only the low order 3 bits are used.
|
||||
1 => convert non-quoted tokens to upper case
|
||||
2 => convert non-quoted tokens to lower case
|
||||
0 => do not convert non-quoted tokens
|
||||
(this is an "int").
|
||||
|
||||
token: a character string containing the returned next token
|
||||
(this is a "char[]").
|
||||
|
||||
maxtok: the maximum size of "token". characters beyond
|
||||
"maxtok" are truncated (this is an "int").
|
||||
|
||||
string: the string to be parsed (this is a "char[]").
|
||||
|
||||
white: a string of the valid white spaces. example:
|
||||
|
||||
char whitesp[]={" \t"};
|
||||
|
||||
blank and tab will be valid white space (this is
|
||||
a "char[]").
|
||||
|
||||
break: a string of the valid break characters. example:
|
||||
|
||||
char breakch[]={";,"};
|
||||
|
||||
semicolon and comma will be valid break characters
|
||||
(this is a "char[]").
|
||||
|
||||
IMPORTANT: do not use the name "break" as a C
|
||||
variable, as this is a reserved word in C.
|
||||
|
||||
quote: a string of the valid quote characters. an example
|
||||
would be
|
||||
|
||||
char quotech[]={"'\""};
|
||||
|
||||
(this causes single and double quotes to be valid)
|
||||
note that a token starting with one of these characters
|
||||
needs the same quote character to terminate it.
|
||||
|
||||
for example,
|
||||
|
||||
"ABC '
|
||||
|
||||
is unterminated, but
|
||||
|
||||
"DEF" and 'GHI'
|
||||
|
||||
are properly terminated. note that different quote
|
||||
characters can appear on the same line; only for
|
||||
a given token do the quote characters have to be
|
||||
the same (this is a "char[]").
|
||||
|
||||
escape: the escape character (NOT a string ... only one
|
||||
allowed). use zero if none is desired (this is
|
||||
a "char").
|
||||
|
||||
brkused: the break character used to terminate the current
|
||||
token. if the token was quoted, this will be the
|
||||
quote used. if the token is the last one on the
|
||||
line, this will be zero (this is a pointer to a
|
||||
"char").
|
||||
|
||||
next: this variable points to the first character of the
|
||||
next token. it gets reset by "parser" as it steps
|
||||
through the string. set it to 0 upon initialization,
|
||||
and leave it alone after that. you can change it
|
||||
if you want to jump around in the string or re-parse
|
||||
from the beginning, but be careful (this is a
|
||||
pointer to an "int").
|
||||
|
||||
quoted: set to 1 (true) if the token was quoted and 0 (false)
|
||||
if not. you may need this information (for example:
|
||||
in C, a string with quotes around it is a character
|
||||
string, while one without is an identifier).
|
||||
|
||||
(this is a pointer to a "char").
|
||||
|
||||
Example 1:
|
||||
|
||||
char whitesp[]={" \t"}; // blank and tab
|
||||
char breakch[]={",\r"}; // comma and carriage return
|
||||
char quotech[]={"'\""}; // single and double quote
|
||||
char escape='^'; // "uparrow" is escape
|
||||
|
||||
main()
|
||||
{
|
||||
char *fgets(),line[81],brkused,quoted,token[81];
|
||||
int i,next;
|
||||
|
||||
while(fgets(line,80,stdin)!=NULL) // get line
|
||||
{
|
||||
|
||||
printf("Line: %s",line); // already has <CR>
|
||||
i=0;
|
||||
|
||||
next=0; // make sure you do this
|
||||
|
||||
while(parser(2,token,80,line,whitesp,breakch,quotech,escape,
|
||||
&brkused,&next,&quoted)==0)
|
||||
{
|
||||
printf(" Token %d = (%s)\n",++i,token);
|
||||
|
||||
if(brkused=='\r') // <CR> is a break so it won't be included
|
||||
break; // in the token. treat as end-of-line here
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
In the above example, lines are read from stdin and broken up into
|
||||
tokens. All non-quoted tokens are converted to lower case. Since
|
||||
fgets() returns the final carriage return, we treat it as a break
|
||||
character to keep it out of the returned token. Also, since the only
|
||||
way "parser" will return a non-zero error code is at end of line,
|
||||
we test "brkused" to see if we've gotten to the final carriage
|
||||
return, and we explicitly break out of the inner loop if we've
|
||||
hit it. Note that since fgets() puts the final <CR> right before
|
||||
the end-of-string, if we left out the "if(brkused=='\r')" test,
|
||||
we'd get one extra null token (just as if the line ended with a
|
||||
single comma). Run this example to see how it all works.
|
||||
|
||||
Example 2:
|
||||
|
||||
.
|
||||
.
|
||||
.
|
||||
|
||||
next=0;
|
||||
result=parser(1, newstr, 80, str, "", "", "", 0, &brkused, &next,
|
||||
&quoted);
|
||||
.
|
||||
.
|
||||
.
|
||||
|
||||
this call takes whatever is in "str" and converts it to upper case,
|
||||
putting the result in "newstr".
|
||||
|
||||
*** end of examples ***
|
||||
|
||||
in case you're interested, "parser.c" was inspired by a system
|
||||
subroutine that comes as part of the PRIMOS operating system for
|
||||
the Prime Computer: "gt$par.plp". i loosely patterned this routine
|
||||
after the Prime routine.
|
||||
|
||||
Revisions:
|
||||
|
||||
09/30/84 Lloyd Zusman Initial coding
|
||||
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
#include "stdafx.h"
|
||||
#include <ctype.h>
|
||||
#include "parser.h"
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Find the first occurrence of a character in a NUL-terminated string.
//
//   ch      character to look for
//   string  string to search; a null pointer is treated as an empty string
//
// Returns the zero-based position of the first match, or -1 when ch does
// not occur before the terminating NUL.  The terminator itself is never
// matched, so sindex(0, s) is always -1.
int sindex(char ch,char *string)
{
    char *cp;

    if(!string)                         // nothing to search
        return -1;

    for(cp=string;*cp;++cp)
        if(ch==*cp)
            return (int)(cp-string);    // position of the match

    return -1;                          // reached end of string, no match
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// routine to store a character in a string
|
||||
void chstore(char *string,int max,char ch, int *pos, int state,
|
||||
int flag)
|
||||
{
|
||||
char c;
|
||||
if(*pos >= 0 && *pos < max-1)
|
||||
{
|
||||
if(state==IN_QUOTE)
|
||||
c=ch;
|
||||
else
|
||||
switch(flag)
|
||||
{
|
||||
case 1: /* convert to upper */
|
||||
c=toupper(ch);
|
||||
break;
|
||||
|
||||
case 2: /* convert to lower */
|
||||
c=tolower(ch);
|
||||
break;
|
||||
|
||||
default: /* use as is */
|
||||
c=ch;
|
||||
break;
|
||||
}
|
||||
string[(*pos)++]=c;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// parse a given string for a token
|
||||
int parser(int inflag,char *token,int tokmax,char *line,char *white,
|
||||
char *brkchar,char *quote,char eschar,char *brkused,int *next,
|
||||
char *quoted)
|
||||
{
|
||||
int _p_tokpos; // current token pos
|
||||
int _p_state; // current state
|
||||
int _p_flag; // option flag
|
||||
char _p_curquote; // current quote char
|
||||
int qp;
|
||||
char c,nc;
|
||||
|
||||
*brkused=0; /* initialize to null */
|
||||
*quoted=0; /* assume not quoted */
|
||||
|
||||
if(!line[*next]) /* if we're at end of line, indicate such */
|
||||
return 1;
|
||||
|
||||
_p_state=IN_WHITE; /* initialize state */
|
||||
_p_curquote=0; /* initialize previous quote char */
|
||||
_p_flag=inflag; /* set option flag */
|
||||
|
||||
for(_p_tokpos=0;c=line[*next];++(*next)) /* main loop */
|
||||
{
|
||||
if((qp=sindex(c,brkchar))>=0) /* break */
|
||||
{
|
||||
switch(_p_state)
|
||||
{
|
||||
case IN_WHITE: /* these are the same here ... */
|
||||
case IN_TOKEN: /* ... just get out */
|
||||
case IN_OZONE: /* ditto */
|
||||
++(*next);
|
||||
*brkused=brkchar[qp];
|
||||
goto byebye;
|
||||
|
||||
case IN_QUOTE: /* just keep going */
|
||||
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if((qp=sindex(c,quote))>=0) /* quote */
|
||||
{
|
||||
switch(_p_state)
|
||||
{
|
||||
case IN_WHITE: /* these are identical, */
|
||||
_p_state=IN_QUOTE; /* change states */
|
||||
_p_curquote=quote[qp]; /* save quote char */
|
||||
*quoted=1; /* set to true as long as something is in quotes */
|
||||
break;
|
||||
|
||||
case IN_QUOTE:
|
||||
if(quote[qp]==_p_curquote) /* same as the beginning quote? */
|
||||
{
|
||||
_p_state=IN_OZONE;
|
||||
_p_curquote=0;
|
||||
}
|
||||
else
|
||||
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag); /* treat as regular char */
|
||||
break;
|
||||
|
||||
case IN_TOKEN:
|
||||
case IN_OZONE:
|
||||
*brkused=c; /* uses quote as break char */
|
||||
goto byebye;
|
||||
}
|
||||
}
|
||||
else if((qp=sindex(c,white))>=0) /* white */
|
||||
{
|
||||
switch(_p_state)
|
||||
{
|
||||
case IN_WHITE:
|
||||
case IN_OZONE:
|
||||
break; /* keep going */
|
||||
|
||||
case IN_TOKEN:
|
||||
_p_state=IN_OZONE;
|
||||
break;
|
||||
|
||||
case IN_QUOTE:
|
||||
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag); /* it's valid here */
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if(c==eschar) /* escape */
|
||||
{
|
||||
nc=line[(*next)+1];
|
||||
if(nc==0) /* end of line */
|
||||
{
|
||||
*brkused=0;
|
||||
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
||||
++(*next);
|
||||
goto byebye;
|
||||
}
|
||||
switch(_p_state)
|
||||
{
|
||||
case IN_WHITE:
|
||||
--(*next);
|
||||
_p_state=IN_TOKEN;
|
||||
break;
|
||||
|
||||
case IN_TOKEN:
|
||||
case IN_QUOTE:
|
||||
++(*next);
|
||||
chstore(token,tokmax,nc,&_p_tokpos,_p_state,_p_flag);
|
||||
break;
|
||||
|
||||
case IN_OZONE:
|
||||
goto byebye;
|
||||
}
|
||||
}
|
||||
else /* anything else is just a real character */
|
||||
{
|
||||
switch(_p_state)
|
||||
{
|
||||
case IN_WHITE:
|
||||
_p_state=IN_TOKEN; /* switch states */
|
||||
|
||||
case IN_TOKEN: /* these 2 are */
|
||||
case IN_QUOTE: /* identical here */
|
||||
chstore(token,tokmax,c,&_p_tokpos,_p_state,_p_flag);
|
||||
break;
|
||||
|
||||
case IN_OZONE:
|
||||
goto byebye;
|
||||
}
|
||||
}
|
||||
} /* end of main loop */
|
||||
|
||||
byebye:
|
||||
token[_p_tokpos]=0; /* make sure token ends with EOS */
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
Reference in a new issue