Who remembers flex, the fast lexical analyzer generator?
I wrote a short mail logfile scanner sample a long time ago under GNU/Linux with gcc and flex. MailScanner.yy is a win32 port of it using gnuwin32 flex and getoptwin:
%option noyywrap
%{
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "getopt.h"
#include <string.h>
#define MAXLEN 1024
#define OUT (void)printf
int i, j, idx, len, mode = 0;
char tmps[MAXLEN], reverse[MAXLEN];
%}
/* One IPv4 octet (0-255), split by value range: 250-255, 200-249, 100-199, 0-99. */
SEGA 25[0-5]
SEGB 2[0-4][0-9]
SEGC 1[0-9]{2}
SEGD [1-9]?[0-9]
SEG {SEGA}|{SEGB}|{SEGC}|{SEGD}
/* Dotted-quad address. The dots are quoted OUTSIDE a character class: inside */
/* [...] a double quote is an ordinary character and would be matched itself. */
IP {SEG}"."{SEG}"."{SEG}"."{SEG}
/* One host name label followed by its dot, e.g. "mail." */
HOSTDOMAINSEGMENT [0-9a-zA-Z_\-]+"."
TOPLEVELDOMAIN [a-zA-Z]{2,7}
HOSTNAME {HOSTDOMAINSEGMENT}+{TOPLEVELDOMAIN}
/* Local part of an e-mail address: letters, digits, '_', '-' and '.'. */
USER [0-9A-Za-z_\-.]+
EMAIL1 {USER}"@"{HOSTNAME}
EMAIL2 {USER}"@"{IP}
URIPROTOCOL [a-zA-Z]{2,10}"://"
/* Any character that may follow the host part of a URI: everything except */
/* whitespace and the delimiters " @ , > < ( ) { }. */
URISUFFIX [^ \t\n\r"@,><(){}]
URL1 {URIPROTOCOL}{HOSTNAME}{URISUFFIX}*
URL2 {URIPROTOCOL}{IP}{URISUFFIX}*
%%
<<EOF>> {
    /*
     * End of input is the normal way for a scan to finish, so hand control
     * back to the caller of yylex() instead of exiting with a failure status.
     */
    yyterminate();
}
{EMAIL1} |
{EMAIL2} {
    /*
     * An e-mail address was matched. Print one line for it, formatted
     * according to the low four bits of the global `mode`:
     *   1 -> the '@' and everything after it
     *   2 -> everything after the '@'
     *   4 -> the part after the '@' with its dot-separated segments in
     *        reverse order
     *   8 -> everything after the last '.'
     *   anything else -> the full match
     */
    const char *at = strchr(yytext, '@');
    if (at != NULL)
    {
        const char *src = yytext;   /* text to print; default: full match */

        switch (mode % 16)
        {
        case 1:
            src = at;
            break;
        case 2:
            src = at + 1;
            break;
        case 4:
            /*
             * The reversed string is one character longer than its input
             * (see below), so the input is capped at MAXLEN - 2 characters
             * to keep reverse[len + 1] inside the buffer.
             */
            strncpy(tmps, at + 1, MAXLEN - 2);
            tmps[MAXLEN - 2] = '\0';
            len = (int)strlen(tmps);
            for (j = 0, idx = 0; j < len; j++)
            {
                if (tmps[j] == '.')
                {
                    /* Move segment tmps[idx..j], dot included, to its
                       mirrored position counted from the end. */
                    for (i = idx; i <= j; i++)
                    {
                        reverse[(len - j) + (i - idx)] = tmps[i];
                    }
                    idx = j + 1;
                }
            }
            /* The last segment goes to the front, followed by a '.' as
               separator; "mail.example.com" becomes "com.example.mail.". */
            for (i = idx; i <= len; i++)
            {
                reverse[i - idx] = (i < len) ? tmps[i] : '.';
            }
            reverse[len + 1] = '\0';
            src = reverse;
            break;
        case 8:
            /* Both address patterns contain at least one '.' after the
               '@', so strrchr() cannot return NULL here. */
            src = strrchr(yytext, '.') + 1;
            break;
        default:
            break;
        }
        /* Bounded copy: a match may be longer than the fixed-size buffer. */
        strncpy(tmps, src, MAXLEN - 1);
        tmps[MAXLEN - 1] = '\0';
        OUT("%s\n", tmps);
    }
}
{URL1} |
{URL2} {
    /*
     * A URI was matched. Nothing is printed once the value 16 has been
     * added to the global `mode` (option -n). Otherwise print one line,
     * formatted according to the low four bits of `mode`:
     *   1 -> from the first '/' to the end
     *   2 -> everything after the last '/'
     *   4 -> the text after the last '/' with its dot-separated segments
     *        in reverse order
     *   8 -> everything after the last '.'
     *   anything else -> the full match
     */
    if (mode < 16)
    {
        const char *src = yytext;   /* text to print; default: full match */

        switch (mode % 16)
        {
        case 1:
            /* Every URI pattern contains "://", so a '/' always exists. */
            src = strchr(yytext, '/');
            break;
        case 2:
            src = strrchr(yytext, '/') + 1;
            break;
        case 4:
            /*
             * The reversed string is one character longer than its input
             * (see below), so the input is capped at MAXLEN - 2 characters
             * to keep reverse[len + 1] inside the buffer.
             */
            strncpy(tmps, strrchr(yytext, '/') + 1, MAXLEN - 2);
            tmps[MAXLEN - 2] = '\0';
            len = (int)strlen(tmps);
            for (j = 0, idx = 0; j < len; j++)
            {
                if (tmps[j] == '.')
                {
                    /* Move segment tmps[idx..j], dot included, to its
                       mirrored position counted from the end. */
                    for (i = idx; i <= j; i++)
                    {
                        reverse[(len - j) + (i - idx)] = tmps[i];
                    }
                    idx = j + 1;
                }
            }
            /* The last segment goes to the front, followed by a '.' as
               separator; "www.example.com" becomes "com.example.www.". */
            for (i = idx; i <= len; i++)
            {
                reverse[i - idx] = (i < len) ? tmps[i] : '.';
            }
            reverse[len + 1] = '\0';
            src = reverse;
            break;
        case 8:
            /* Host names and IP addresses both contain a '.', so
               strrchr() cannot return NULL here. */
            src = strrchr(yytext, '.') + 1;
            break;
        default:
            break;
        }
        /* Bounded copy: a match may be longer than the fixed-size buffer. */
        strncpy(tmps, src, MAXLEN - 1);
        tmps[MAXLEN - 1] = '\0';
        OUT("%s\n", tmps);
    }
}
^[\n;]  |
[\r\n]+ |
.       {
    /* Input that is neither an e-mail address nor a URI is dropped. */
}
%%
/* Error hook: ends the program at once with exit status 1. */
void yyerror()
{
    exit(1);
}
/*
 * Print the command-line help text to stdout and terminate the program
 * with exit status 0. This function does not return.
 *
 * cmd: program name shown in the synopsis line.
 */
void usage(const char *cmd)
{
    OUT("Usage: %s [-h] [-a] [-u] [-t] [-r] [-n]\n", cmd);
    OUT("\tsimple email address and uri lexer reads from stdin \n");
    OUT("\t-h,--help   \tprints this help text\n");
    OUT("\t-a,--noat   \tprints FQDN of email (chars right of '@')\n");
    OUT("\t-u,--nouser \tprints email without username \n");
    OUT("\t-t,--top    \tprints only the topleveldomain\n");
    OUT("\t-n,--nouris \tprints only email address and not URIs\n");
    OUT("\t-r,--reverse\treverses FQDN/IP address segments\n");
    exit(0);
}
/*
 * Program entry point: translate the command-line options into the global
 * `mode`, then run the scanner over stdin.
 *
 * Layout of `mode`: the low four bits select the output format (1 for -u,
 * 2 for -a, 4 for -r, 8 for -t; when several are given the last one wins)
 * and the value 16 is set by -n to suppress URI output.
 *
 * Returns 0 once yylex() returns.
 */
int _tmain(int argc, TCHAR** argv)
{
    static struct option long_options[] =
    {
        {_T("help"), ARG_NONE, 0, _T('h')},
        {_T("noat"), ARG_NONE, 0, _T('a')},
        {_T("nouser"), ARG_NONE, 0, _T('u')},
        {_T("top"), ARG_NONE, 0, _T('t')},
        {_T("nouris"), ARG_NONE, 0, _T('n')},
        {_T("reverse"), ARG_NONE, 0, _T('r')},
        { ARG_NULL, ARG_NULL, ARG_NULL, ARG_NULL}
    };
    int c;

    for (;;)
    {
        int option_index = 0;

        /* No option takes an argument, so the short-option string must not
           contain a ':'; with "r:" the -r switch swallowed the next word. */
        c = getopt_long(argc, argv, _T("hautnr"), long_options, &option_index);
        if (c == -1)
            break;

        switch (c) // Handle options
        {
        case 0: // If this option set a flag, do nothing else now.
            if (long_options[option_index].flag != 0)
                break;
            _tprintf (_T("option %s"), long_options[option_index].name);
            if (optarg)
                _tprintf (_T(" with arg %s"), optarg);
            _tprintf (_T("\n"));
            break;
        /* The format options replace only the low bits so that an earlier
           -n is not lost. */
        case _T('u'): mode = (mode & 16) | 1;
            break;
        case _T('a'): mode = (mode & 16) | 2;
            break;
        case _T('r'): mode = (mode & 16) | 4;
            break;
        case _T('h'): usage(argv[0]);
            break;
        case _T('t'): mode = (mode & 16) | 8;
            break;
        /* OR instead of += so that repeating -n cannot disturb other bits. */
        case _T('n'): mode |= 16;
            break;
        case '?': // getopt_long already printed an error message.
            break;
        default: abort();
        }
    }
    (void) fflush(stdout);
    yyin = stdin;
    yylex();
    return 0;
}
Files: [MailScanner.yy] [lex.yy.c]
%option noyywrap
%{
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "getopt.h"
#include <string.h>
#define MAXLEN 1024
#define OUT (void)printf
#define OUT (void)printf
int i, j, idx, len, mode = 0;
char tmps[MAXLEN], reverse[MAXLEN];
%}
/* One IPv4 octet (0-255), split by value range: 250-255, 200-249, 100-199, 0-99. */
SEGA 25[0-5]
SEGB 2[0-4][0-9]
SEGC 1[0-9]{2}
SEGD [1-9]?[0-9]
SEG {SEGA}|{SEGB}|{SEGC}|{SEGD}
/* Dotted-quad address. The dots are quoted OUTSIDE a character class: inside */
/* [...] a double quote is an ordinary character and would be matched itself. */
IP {SEG}"."{SEG}"."{SEG}"."{SEG}
/* One host name label followed by its dot, e.g. "mail." */
HOSTDOMAINSEGMENT [0-9a-zA-Z_\-]+"."
TOPLEVELDOMAIN [a-zA-Z]{2,7}
HOSTNAME {HOSTDOMAINSEGMENT}+{TOPLEVELDOMAIN}
/* Local part of an e-mail address: letters, digits, '_', '-' and '.'. */
USER [0-9A-Za-z_\-.]+
EMAIL1 {USER}"@"{HOSTNAME}
EMAIL2 {USER}"@"{IP}
URIPROTOCOL [a-zA-Z]{2,10}"://"
/* Any character that may follow the host part of a URI: everything except */
/* whitespace and the delimiters " @ , > < ( ) { }. */
URISUFFIX [^ \t\n\r"@,><(){}]
URL1 {URIPROTOCOL}{HOSTNAME}{URISUFFIX}*
URL2 {URIPROTOCOL}{IP}{URISUFFIX}*
%%
<<EOF>> {
    /*
     * End of input is the normal way for a scan to finish, so hand control
     * back to the caller of yylex() instead of exiting with a failure status.
     */
    yyterminate();
}
{EMAIL1} |
{EMAIL2} {
    /*
     * An e-mail address was matched. Print one line for it, formatted
     * according to the low four bits of the global `mode`:
     *   1 -> the '@' and everything after it
     *   2 -> everything after the '@'
     *   4 -> the part after the '@' with its dot-separated segments in
     *        reverse order
     *   8 -> everything after the last '.'
     *   anything else -> the full match
     */
    const char *at = strchr(yytext, '@');
    if (at != NULL)
    {
        const char *src = yytext;   /* text to print; default: full match */

        switch (mode % 16)
        {
        case 1:
            src = at;
            break;
        case 2:
            src = at + 1;
            break;
        case 4:
            /*
             * The reversed string is one character longer than its input
             * (see below), so the input is capped at MAXLEN - 2 characters
             * to keep reverse[len + 1] inside the buffer.
             */
            strncpy(tmps, at + 1, MAXLEN - 2);
            tmps[MAXLEN - 2] = '\0';
            len = (int)strlen(tmps);
            for (j = 0, idx = 0; j < len; j++)
            {
                if (tmps[j] == '.')
                {
                    /* Move segment tmps[idx..j], dot included, to its
                       mirrored position counted from the end. */
                    for (i = idx; i <= j; i++)
                    {
                        reverse[(len - j) + (i - idx)] = tmps[i];
                    }
                    idx = j + 1;
                }
            }
            /* The last segment goes to the front, followed by a '.' as
               separator; "mail.example.com" becomes "com.example.mail.". */
            for (i = idx; i <= len; i++)
            {
                reverse[i - idx] = (i < len) ? tmps[i] : '.';
            }
            reverse[len + 1] = '\0';
            src = reverse;
            break;
        case 8:
            /* Both address patterns contain at least one '.' after the
               '@', so strrchr() cannot return NULL here. */
            src = strrchr(yytext, '.') + 1;
            break;
        default:
            break;
        }
        /* Bounded copy: a match may be longer than the fixed-size buffer. */
        strncpy(tmps, src, MAXLEN - 1);
        tmps[MAXLEN - 1] = '\0';
        OUT("%s\n", tmps);
    }
}
{URL1} |
{URL2} {
    /*
     * A URI was matched. Nothing is printed once the value 16 has been
     * added to the global `mode` (option -n). Otherwise print one line,
     * formatted according to the low four bits of `mode`:
     *   1 -> from the first '/' to the end
     *   2 -> everything after the last '/'
     *   4 -> the text after the last '/' with its dot-separated segments
     *        in reverse order
     *   8 -> everything after the last '.'
     *   anything else -> the full match
     */
    if (mode < 16)
    {
        const char *src = yytext;   /* text to print; default: full match */

        switch (mode % 16)
        {
        case 1:
            /* Every URI pattern contains "://", so a '/' always exists. */
            src = strchr(yytext, '/');
            break;
        case 2:
            src = strrchr(yytext, '/') + 1;
            break;
        case 4:
            /*
             * The reversed string is one character longer than its input
             * (see below), so the input is capped at MAXLEN - 2 characters
             * to keep reverse[len + 1] inside the buffer.
             */
            strncpy(tmps, strrchr(yytext, '/') + 1, MAXLEN - 2);
            tmps[MAXLEN - 2] = '\0';
            len = (int)strlen(tmps);
            for (j = 0, idx = 0; j < len; j++)
            {
                if (tmps[j] == '.')
                {
                    /* Move segment tmps[idx..j], dot included, to its
                       mirrored position counted from the end. */
                    for (i = idx; i <= j; i++)
                    {
                        reverse[(len - j) + (i - idx)] = tmps[i];
                    }
                    idx = j + 1;
                }
            }
            /* The last segment goes to the front, followed by a '.' as
               separator; "www.example.com" becomes "com.example.www.". */
            for (i = idx; i <= len; i++)
            {
                reverse[i - idx] = (i < len) ? tmps[i] : '.';
            }
            reverse[len + 1] = '\0';
            src = reverse;
            break;
        case 8:
            /* Host names and IP addresses both contain a '.', so
               strrchr() cannot return NULL here. */
            src = strrchr(yytext, '.') + 1;
            break;
        default:
            break;
        }
        /* Bounded copy: a match may be longer than the fixed-size buffer. */
        strncpy(tmps, src, MAXLEN - 1);
        tmps[MAXLEN - 1] = '\0';
        OUT("%s\n", tmps);
    }
}
^[\n;]  |
[\r\n]+ |
.       {
    /* Input that is neither an e-mail address nor a URI is dropped. */
}
%%
/* Error hook: ends the program at once with exit status 1. */
void yyerror()
{
    exit(1);
}
/*
 * Print the command-line help text to stdout and terminate the program
 * with exit status 0. This function does not return.
 *
 * cmd: program name shown in the synopsis line.
 */
void usage(const char *cmd)
{
    OUT("Usage: %s [-h] [-a] [-u] [-t] [-r] [-n]\n", cmd);
    OUT("\tsimple email address and uri lexer reads from stdin \n");
    OUT("\t-h,--help   \tprints this help text\n");
    OUT("\t-a,--noat   \tprints FQDN of email (chars right of '@')\n");
    OUT("\t-u,--nouser \tprints email without username \n");
    OUT("\t-t,--top    \tprints only the topleveldomain\n");
    OUT("\t-n,--nouris \tprints only email address and not URIs\n");
    OUT("\t-r,--reverse\treverses FQDN/IP address segments\n");
    exit(0);
}
/*
 * Program entry point: translate the command-line options into the global
 * `mode`, then run the scanner over stdin.
 *
 * Layout of `mode`: the low four bits select the output format (1 for -u,
 * 2 for -a, 4 for -r, 8 for -t; when several are given the last one wins)
 * and the value 16 is set by -n to suppress URI output.
 *
 * Returns 0 once yylex() returns.
 */
int _tmain(int argc, TCHAR** argv)
{
    static struct option long_options[] =
    {
        {_T("help"), ARG_NONE, 0, _T('h')},
        {_T("noat"), ARG_NONE, 0, _T('a')},
        {_T("nouser"), ARG_NONE, 0, _T('u')},
        {_T("top"), ARG_NONE, 0, _T('t')},
        {_T("nouris"), ARG_NONE, 0, _T('n')},
        {_T("reverse"), ARG_NONE, 0, _T('r')},
        { ARG_NULL, ARG_NULL, ARG_NULL, ARG_NULL}
    };
    int c;

    for (;;)
    {
        int option_index = 0;

        /* No option takes an argument, so the short-option string must not
           contain a ':'; with "r:" the -r switch swallowed the next word. */
        c = getopt_long(argc, argv, _T("hautnr"), long_options, &option_index);
        if (c == -1)
            break;

        switch (c) // Handle options
        {
        case 0: // If this option set a flag, do nothing else now.
            if (long_options[option_index].flag != 0)
                break;
            _tprintf (_T("option %s"), long_options[option_index].name);
            if (optarg)
                _tprintf (_T(" with arg %s"), optarg);
            _tprintf (_T("\n"));
            break;
        /* The format options replace only the low bits so that an earlier
           -n is not lost. */
        case _T('u'): mode = (mode & 16) | 1;
            break;
        case _T('a'): mode = (mode & 16) | 2;
            break;
        case _T('r'): mode = (mode & 16) | 4;
            break;
        case _T('h'): usage(argv[0]);
            break;
        case _T('t'): mode = (mode & 16) | 8;
            break;
        /* OR instead of += so that repeating -n cannot disturb other bits. */
        case _T('n'): mode |= 16;
            break;
        case '?': // getopt_long already printed an error message.
            break;
        default: abort();
        }
    }
    (void) fflush(stdout);
    yyin = stdin;
    yylex();
    return 0;
}
Files: [MailScanner.yy] [lex.yy.c]
Keine Kommentare:
Kommentar veröffentlichen