CB32 1.2 sources

2014-11-26 13:59:13 -06:00 · 2014-11-26 13:59:13 -06:00 · fd87f89a19
commit fd87f89a19
parent cf25bc7627
100 changed files with 14543 additions and 0 deletions
--- a/Parser.cpp
+++ b/Parser.cpp
@ -0,0 +1,445 @@
+#ifdef IGNORE_THIS
+
+/*
+	PARSER.C	Lloyd Zusman, Master Byte Software, Trump User Group
+			(408) 395-5693 (voice only)
+
+	This program is a generalized, finite state token parser.  It
+	is the most powerful parser I've seen on any BBS (if I do say
+	so myself).  It allows you to extract tokens one at a time from a
+	string of characters.  The characters used for white space, for
+	break characters, and for quotes can be specified.  Also,
+	characters in the string can be preceded by a specifiable escape
+	character which removes any special meaning the character may have.
+
+	There are a lot of formal parameters in this subroutine call, but
+	once you get familiar with them, this routine is fairly easy to use.
+	"#define" macros can be used to generate simpler looking calls for
+	commonly used applications of this routine.
+
+	First, some terminology:
+
+	token			used here, a single unit of information in
+				the form of a group of characters.
+
+	white space		space that gets ignored (except within quotes
+				or when escaped), like blanks and tabs.  in
+				addition, white space terminates a non-quoted
+				token.
+
+	break character		a character that separates non-quoted tokens.
+				commas are a common break character.  the
+				usage of break characters to signal the end
+				of a token is the same as that of white space,
+				except multiple break characters with nothing
+				or only white space between generate a null
+				token for each two break characters together.
+
+				for example, if blank is set to be the white 
+				space and comma is set to be the break
+				character, the line ...
+
+				A, B, C ,  , DEF
+
+				... consists of 5 tokens:
+
+				1)	"A"
+				2)	"B"
+				3)	"C"
+				4)	""	(the null string)
+				5)	"DEF"
+
+	quote character		a character that, when surrounding a group
+				of other characters, causes the group of
+				characters to be treated as a single token,
+				no matter how many white spaces or break
+				characters exist in the group.  also, a
+				token always terminates after the closing
+				quote.  for example, if ' is the quote
+				character, blank is white space, and comma
+				is the break character, the following
+				string ...
+
+				A, ' B, CD'EF GHI
+
+				... consists of 4 tokens:
+
+				1)	"A"
+				2)	" B, CD" (note the blanks & comma)
+				3)	"EF"
+				4)	"GHI"
+
+				the quote characters themselves do
+				not appear in the resultant tokens.  the
+				double quotes are delimiters i use here for
+				documentation purposes only.
+
+	escape character	a character which itself is ignored but
+				which causes the next character to be
+				used as is.  ^ and \ are often used as
+				escape characters.  an escape in the last
+				position of the string gets treated as a
+				"normal" (i.e., non-quote, non-white, 
+				non-break, and non-escape) character.
+				for example, assume white space, break
+				character, and quote are the same as in the
+				above examples, and further, assume that
+				^ is the escape character.  then, in the
+				string ...
+
+				ABC, ' DEF ^' GH' I ^ J K^ L ^
+
+				... there are 7 tokens:
+
+				1)	"ABC"
+				2)	" DEF ' GH"
+				3)	"I"
+				4)	" "	(a lone blank)
+				5)	"J"
+				6)	"K L"
+				7)	"^"	(passed as is at end of line)
+
+
+	OK, now that you have this background, here's how to call "parser":
+
+	result=parser(flag,token,maxtok,string,white,break,quote,escape,
+		      brkused,next,quoted)
+
+	result:		0 if we haven't reached EOS (end of string), and
+			1 if we have (this is an "int").
+
+	flag:		right now, only the low order 3 bits are used.
+			1 => convert non-quoted tokens to upper case
+			2 => convert non-quoted tokens to lower case
+			0 => do not convert non-quoted tokens
+			(this is a "char").
+
+	token:		a character string containing the returned next token
+			(this is a "char[]").
+
+	maxtok:		the maximum size of "token".  characters beyond
+			"maxtok" are truncated (this is an "int").
+
+	string:		the string to be parsed (this is a "char[]").
+
+	white:		a string of the valid white spaces.  example:
+
+			char whitesp[]={" \t"};
+
+			blank and tab will be valid white space (this is
+			a "char[]").
+
+	break:		a string of the valid break characters.  example:
+
+			char breakch[]={";,"};
+
+			semicolon and comma will be valid break characters
+			(this is a "char[]").
+			
+			IMPORTANT:  do not use the name "break" as a C
+			variable, as this is a reserved word in C.
+
+	quote:		a string of the valid quote characters.  an example
+			would be
+
+			char quotech[]={"'\""};
+
+			(this causes single and double quotes to be valid)
+			note that a token starting with one of these characters
+			needs the same quote character to terminate it.
+
+			for example, 
+
+			"ABC '
+			
+			is unterminated, but
+
+			"DEF" and 'GHI'
+
+			are properly terminated.  note that different quote
+			characters can appear on the same line; only for
+			a given token do the quote characters have to be
+			the same (this is a "char[]").
+
+	escape:		the escape character (NOT a string ... only one
+			allowed).  use zero if none is desired (this is
+			a "char").
+
+	brkused:	the break character used to terminate the current
+			token.  if the token was quoted, this will be the
+			quote used.  if the token is the last one on the
+			line, this will be zero (this is a pointer to a
+			"char").
+
+	next:		this variable points to the first character of the
+			next token.  it gets reset by "parser" as it steps
+			through the string.  set it to 0 upon initialization,
+			and leave it alone after that.  you can change it
+			if you want to jump around in the string or re-parse
+			from the beginning, but be careful (this is a
+			pointer to an "int").
+
+	quoted:		set to 1 (true) if the token was quoted and 0 (false)
+			if not.  you may need this information (for example:
+			in C, a string with quotes around it is a character
+			string, while one without is an identifier).
+
+			(this is a pointer to a "char").
+
+	Example 1:
+
+	char whitesp[]={" \t"};		// blank and tab 
+	char breakch[]={",\r"};		// comma and carriage return
+	char quotech[]={"'\""};		// single and double quote
+	char escape='^';		// "uparrow" is escape
+
+	main()
+	{
+	  char *fgets(),line[81],brkused,quoted,token[81];
+	  int i,next;
+
+	  while(fgets(line,80,stdin)!=NULL)	// get line
+	  {
+
+	    printf("Line: %s",line);		// already has <CR>
+	    i=0;
+
+	    next=0;				// make sure you do this
+
+	    while(parser(2,token,80,line,whitesp,breakch,quotech,escape,
+			 &brkused,&next,&quoted)==0)
+	    {
+	      printf(" Token %d = (%s)\n",++i,token);
+
+	      if(brkused=='\r')	// <CR> is a break so it won't be included
+			break;			// in the token.  treat as end-of-line here
+	    }
+	  }
+	}
+
+
+
+	In the above example, lines are read from stdin and broken up into
+	tokens.  All non-quoted tokens are converted to lower case.  Since
+	fgets() returns the final carriage return, we treat it as a break
+	character to keep it out of the returned token.  Also, since the only
+	way "parser" will return a non-zero error code is at end of line,
+	we test "brkused" to see if we've gotten to the final carriage
+	return, and we explicitly break out of the inner loop if we've
+	hit it.  Note that since fgets() puts the final <CR> right before
+	the end-of-string, if we left out the "if(brkused=='\r')" test,
+	we'd get one extra null token (just as if the line ended with a
+	single comma).  Run this example to see how it all works.
+
+	Example 2:
+
+		.
+		.
+		.
+
+	next=0;
+	result=parser(1, newstr, 80, str, "", "", "", 0, &brkused, &next,
+		      &quoted);
+		.
+		.
+		.
+	
+	this call takes whatever is in "str" and converts it to upper case,
+	putting the result in "newstr".
+
+	*** end of examples ***
+
+	in case you're interested, "parser.c" was inspired by a system
+	subroutine that comes as part of the PRIMOS operating system for
+	the Prime Computer:  "gt$par.plp".  i loosely patterned this routine
+	after the Prime routine.
+
+	Revisions:
+
+	09/30/84	Lloyd Zusman	Initial coding
+
+*/
+
+#endif
+
+#include "stdafx.h"
+#include <ctype.h>
+#include "parser.h"
+
+
+////////////////////////////////////////////////////////////////
+// Locate the first occurrence of a character in a NUL-terminated string.
+//   ch      character to look for
+//   string  NUL-terminated string to scan
+// Returns the zero-based offset of the first match, or -1 when the end of
+// the string is reached without a match (so searching for '\0' yields -1).
+int sindex(char ch,char *string)
+{
+  int offset=0;
+
+  while(string[offset]!='\0')
+  {
+    if(string[offset]==ch)
+      return offset;            // position of the matching character
+    ++offset;
+  }
+  return -1;                    // hit end of string ... no match found
+}
+    
+///////////////////////////////////////////////////////////////
+// Append one character to a token buffer, optionally converting its case.
+//   string  destination buffer
+//   max     total size of the buffer; at most max-1 characters are stored,
+//           which leaves room for the terminator the caller writes
+//   ch      character to store
+//   pos     in/out: index at which to store; advanced by one on success
+//   state   current parser state; characters stored while IN_QUOTE are
+//           never case-converted
+//   flag    1 => convert to upper case, 2 => convert to lower case,
+//           anything else => store as is
+// A character that does not fit (or a negative *pos) is silently dropped,
+// i.e. the token is truncated.
+void chstore(char *string,int max,char ch, int *pos, int state,
+			 int flag)
+{
+	char c;
+	if(*pos >= 0 && *pos < max-1)
+	{
+		if(state==IN_QUOTE)
+			c=ch;
+		else
+			switch(flag)
+			{
+				// The <ctype.h> functions require a value representable as
+				// unsigned char (or EOF); handing them a negative plain char
+				// is undefined behaviour, hence the casts below.
+				case 1:             /* convert to upper */
+					c=(char)toupper((unsigned char)ch);
+					break;
+
+				case 2:             /* convert to lower */
+					c=(char)tolower((unsigned char)ch);
+					break;
+
+				default:            /* use as is */
+					c=ch;
+					break;
+			}
+		string[(*pos)++]=c;
+	}
+	return;
+}
+  
+//////////////////////////////////////////////////////////////////
+// Extract the next token from a string using a small state machine.
+//
+//   inflag   case conversion for non-quoted characters:
+//            1 => upper case, 2 => lower case, otherwise unchanged
+//   token    receives the NUL-terminated token
+//   tokmax   size of "token"; characters that do not fit are dropped
+//   line     NUL-terminated string being parsed
+//   white    set of white-space characters
+//   brkchar  set of break characters
+//   quote    set of quote characters
+//   eschar   escape character (0 for none)
+//   brkused  out: break character (or quote) that ended the token, 0 if none
+//   next     in/out: index in "line" at which to start; on return, the
+//            index at which the following call should start
+//   quoted   out: 1 if any part of the token was inside quotes, else 0
+//
+// Returns 1 if *next is already at the end of "line" (token is left
+// untouched), otherwise 0.
+//
+// States: IN_WHITE (nothing collected yet), IN_TOKEN (collecting an
+// unquoted token), IN_QUOTE (inside quotes), IN_OZONE (token is complete;
+// skipping trailing white space while looking for a break character).
+// A character is classified as break, quote, white, escape or ordinary,
+// in that order of precedence.
+int parser(int inflag,char *token,int tokmax,char *line,char *white,
+	   char *brkchar,char *quote,char eschar,char *brkused,int *next,
+	   char *quoted)
+{
+	int		tokpos=0;			// next free index in token
+	int		state=IN_WHITE;		// current state
+	char	curquote=0;			// quote char that opened the current quote
+	int		qp;					// index of c within the matching char set
+	char	c,nc;				// current and following character
+
+	*brkused=0;           /* initialize to null */
+	*quoted=0;            /* assume not quoted  */
+
+	if(!line[*next])      /* if we're at end of line, indicate such */
+		return 1;
+
+	for(;(c=line[*next])!=0;++(*next))      /* main loop */
+	{
+		if((qp=sindex(c,brkchar))>=0)          /* break */
+		{
+			switch(state)
+			{
+				case IN_WHITE:          /* these are the same here ...	*/
+				case IN_TOKEN:          /* ... just get out		*/
+				case IN_OZONE:          /* ditto			*/
+					++(*next);          /* consume the break character */
+					*brkused=brkchar[qp];
+					goto byebye;
+
+				case IN_QUOTE:          /* literal inside quotes */
+					chstore(token,tokmax,c,&tokpos,state,inflag);
+					break;
+
+				default:
+					break;
+			}
+		}
+		else if((qp=sindex(c,quote))>=0)       /* quote */
+		{
+			switch(state)
+			{
+				case IN_WHITE:
+					state=IN_QUOTE;         /* change states   */
+					curquote=quote[qp];     /* save quote char */
+					*quoted=1;	/* set to true as long as something is in quotes */
+					break;
+
+				case IN_QUOTE:
+					if(quote[qp]==curquote)	/* same as the beginning quote? */
+					{
+						state=IN_OZONE;     /* token ends after closing quote */
+						curquote=0;
+					}
+					else                    /* a different quote is a regular char */
+						chstore(token,tokmax,c,&tokpos,state,inflag);
+					break;
+
+				case IN_TOKEN:
+				case IN_OZONE:
+					/* the quote ends this token but is not consumed, so */
+					/* the next call starts the quoted token with it     */
+					*brkused=c;
+					goto byebye;
+
+				default:
+					break;
+			}
+		}
+		else if((qp=sindex(c,white))>=0)       /* white */
+		{
+			switch(state)
+			{
+				case IN_WHITE:
+				case IN_OZONE:
+					break;                  /* keep going */
+
+				case IN_TOKEN:
+					state=IN_OZONE;         /* token done; look for a break */
+					break;
+
+				case IN_QUOTE:              /* it's valid here */
+					chstore(token,tokmax,c,&tokpos,state,inflag);
+					break;
+
+				default:
+					break;
+			}
+		}
+		else if(c==eschar)                     /* escape */
+		{
+			nc=line[(*next)+1];
+			if(nc==0)                       /* escape is last char of line */
+			{
+				/* The current token is already complete: leave the   */
+				/* escape for the next call so that it comes back as  */
+				/* its own token instead of being glued onto this one */
+				if(state==IN_OZONE)
+					goto byebye;
+
+				*brkused=0;
+				chstore(token,tokmax,c,&tokpos,state,inflag);  /* passed as is */
+				++(*next);
+				goto byebye;
+			}
+			switch(state)
+			{
+				case IN_WHITE:
+					/* back up so the loop increment revisits this */
+					/* escape, now in the IN_TOKEN state           */
+					--(*next);
+					state=IN_TOKEN;
+					break;
+
+				case IN_TOKEN:
+				case IN_QUOTE:
+					++(*next);              /* skip escape, store next char */
+					chstore(token,tokmax,nc,&tokpos,state,inflag);
+					break;
+
+				case IN_OZONE:              /* escape starts the next token */
+					goto byebye;
+
+				default:
+					break;
+			}
+		}
+		else        /* anything else is just a real character */
+		{
+			switch(state)
+			{
+				case IN_WHITE:
+					state=IN_TOKEN;         /* switch states */
+					/* fallthrough */
+
+				case IN_TOKEN:              /* these 2 are     */
+				case IN_QUOTE:              /*  identical here */
+					chstore(token,tokmax,c,&tokpos,state,inflag);
+					break;
+
+				case IN_OZONE:              /* first char of the next token */
+					goto byebye;
+
+				default:
+					break;
+			}
+		}
+	}             /* end of main loop */
+
+byebye:
+	/* chstore never advances tokpos past tokmax-1, so this index is in */
+	/* range whenever the buffer has any room at all                    */
+	if(tokmax>0)
+		token[tokpos]=0;   /* make sure token ends with EOS */
+
+	return 0;
+
+}