htmlmodify.l
上传用户:seven77cht
上传日期:2007-01-04
资源大小:486k
文件大小:15k
/* Character classes used by the rules.                                      */
/*   W - a single white-space character (space, tab, CR or LF).              */
/*   F - characters accepted in an unquoted attribute value.                 */
/*   K - characters accepted in a tag name or an attribute name.             */
/* NOTE(review): the classes list lower-case letters only although the C     */
/* code compares names with strcasecmp(); presumably the scanner is built    */
/* case-insensitively (e.g. flex -i) - confirm against the Makefile.         */

W               [ \t\r\n]
F               [-a-z0-9$_.!*(),%;/?:@&=+~|#]
K               [a-z0-9-]

/* Exclusive start conditions, one per syntactic context.                    */

%x DOCTYPE
%x COMMENT COMMENT_BAD
%x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
%x DQUOTED SQUOTED

%{
/***************************************
  $Header: /home/amb/wwwoffle/RCS/htmlmodify.l 1.14 2000/03/22 18:34:19 amb Exp $

  WWWOFFLE - World Wide Web Offline Explorer - Version 2.5e.
  Parse the HTML and modify the source.
  ******************/ /******************
  Written by Andrew M. Bishop

  This file Copyright 1997,98,99,2000 Andrew M. Bishop
  It may be distributed under the GNU Public License, version 2, or
  any higher version.  See section COPYING of the GNU Public license
  for conditions under which this file may be redistributed.
  ***************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>

#include <sys/stat.h>
#include <unistd.h>
#include <time.h>

#include "wwwoffle.h"
#include "document.h"
#include "config.h"
#include "misc.h"
/* Parser outputs: the non-zero token types that htmlmodify_yylex() can hand
   to modify_html().  Zero is used by the lexer for end of input.  The rules
   currently write plain text, comments and doctypes straight to the output,
   so only the tag and attribute tokens are actually returned. */

enum
{
 LEX_PLAINTEXT = 1,             /* A run of text outside any tag. */
 LEX_COMMENT   = 2,             /* A comment. */
 LEX_DOCTYPE   = 3,             /* A <!DOCTYPE ...> declaration. */
 LEX_TAG_BEGIN = 4,             /* A tag name (yylval holds the name, with any leading '/'). */
 LEX_TAG_END   = 5,             /* The end of a tag. */
 LEX_ATTR_KEY  = 6,             /* An attribute name (yylval holds the name). */
 LEX_ATTR_VAL  = 7              /* An attribute value (yylval is NULL if there is none). */
};
/* Tag types: the tags that are treated specially.  Each value is the index of
   the matching string in the table of tag strings; a double underscore marks
   a closing tag.  tag_ntags is both the number of known tags and the value
   used for "no / unknown tag". */

typedef enum _HTMLTags
{
 tag_a         = 0,             /* "a"         */
 tag__a        = 1,             /* "/a"        */
 tag_base      = 2,             /* "base"      */
 tag_blink     = 3,             /* "blink"     */
 tag__blink    = 4,             /* "/blink"    */
 tag__body     = 5,             /* "/body"     */
 tag__html     = 6,             /* "/html"     */
 tag_noscript  = 7,             /* "noscript"  */
 tag__noscript = 8,             /* "/noscript" */
 tag_script    = 9,             /* "script"    */
 tag__script   = 10,            /* "/script"   */
 tag_ntags     = 11
}
HTMLTags;
/* Tag strings: indexed by the tag type values, so the order of the entries
   must stay in step with that enumeration.  The table is read-only. */

static const char *const tags[]=
{
 /* tag_a         =  0 */ "a",
 /* tag__a        =  1 */ "/a",
 /* tag_base      =  2 */ "base",
 /* tag_blink     =  3 */ "blink",
 /* tag__blink    =  4 */ "/blink",
 /* tag__body     =  5 */ "/body",
 /* tag__html     =  6 */ "/html",
 /* tag_noscript  =  7 */ "noscript",
 /* tag__noscript =  8 */ "/noscript",
 /* tag_script    =  9 */ "script",
 /* tag__script   = 10 */ "/script"
};
/* Attribute types: the attributes that are treated specially.  Each value is
   the index of the matching string in the table of attribute strings.
   att_natts is both the number of known attributes and the value used for
   "no / unknown attribute". */

typedef enum _HTMLAttributes
{
 att_href        = 0,           /* "href"        */
 att_onblur      = 1,           /* "onblur"      */
 att_onchange    = 2,           /* "onchange"    */
 att_onclick     = 3,           /* "onclick"     */
 att_ondblclick  = 4,           /* "ondblclick"  */
 att_onfocus     = 5,           /* "onfocus"     */
 att_onkeydown   = 6,           /* "onkeydown"   */
 att_onkeypress  = 7,           /* "onkeypress"  */
 att_onload      = 8,           /* "onload"      */
 att_onmousedown = 9,           /* "onmousedown" */
 att_onmousemove = 10,          /* "onmousemove" */
 att_onmouseout  = 11,          /* "onmouseout"  */
 att_onmouseover = 12,          /* "onmouseover" */
 att_onmouseup   = 13,          /* "onmouseup"   */
 att_onreset     = 14,          /* "onreset"     */
 att_onselect    = 15,          /* "onselect"    */
 att_onsubmit    = 16,          /* "onsubmit"    */
 att_onunload    = 17,          /* "onunload"    */
 att_natts       = 18
}
HTMLAttributes;
/* Attribute strings: indexed by the attribute type values, so the order of
   the entries must stay in step with that enumeration.  The table is
   read-only. */

static const char *const attributes[]=
{
 /* att_href        =  0 */ "href",
 /* att_onblur      =  1 */ "onblur",
 /* att_onchange    =  2 */ "onchange",
 /* att_onclick     =  3 */ "onclick",
 /* att_ondblclick  =  4 */ "ondblclick",
 /* att_onfocus     =  5 */ "onfocus",
 /* att_onkeydown   =  6 */ "onkeydown",
 /* att_onkeypress  =  7 */ "onkeypress",
 /* att_onload      =  8 */ "onload",
 /* att_onmousedown =  9 */ "onmousedown",
 /* att_onmousemove = 10 */ "onmousemove",
 /* att_onmouseout  = 11 */ "onmouseout",
 /* att_onmouseover = 12 */ "onmouseover",
 /* att_onmouseup   = 13 */ "onmouseup",
 /* att_onreset     = 14 */ "onreset",
 /* att_onselect    = 15 */ "onselect",
 /* att_onsubmit    = 16 */ "onsubmit",
 /* att_onunload    = 17 */ "onunload"
};
/* Definitions of why the output is disabled.  These are bit flags that are
   OR-ed into and masked out of the disable_output variable, so each reason
   must use a distinct bit. */

enum
{
 DISABLE_NONE   = 0,            /* Nothing is suppressing the output. */
 DISABLE_SCRIPT = 1,            /* Suppressed while removing script related tags. */
 DISABLE_BLINK  = 2             /* Suppressed while removing a blink tag. */
};
- static void modify_html(void);
- static char *htmlmodify_yylval=NULL;
- extern int htmlmodify_yylex(void);
- /*+ The file descriptor to output to. +*/
- static int output_fd=-1;
- /*+ The add-cache-info optional footer. +*/
- static char *cache_info=NULL;
- /*+ The file descriptor that we are reading from. +*/
- static int htmlmodify_yyfd=-1;
- /*+ The base URL of this page. +*/
- static URL *baseUrl=NULL;
- /*+ The quote character used. +*/
- static char *quote="";
- /*+ Set this to disable the output. +*/
- static int disable_output=DISABLE_NONE;
- /*++++++++++++++++++++++++++++++++++++++
- Output the file with the modificatons if it is HTML, else just output.
- int client The file to write to.
- int spool The file to read from.
- URL *Url The URL that we are parsing.
- ++++++++++++++++++++++++++++++++++++++*/
- void OutputHTMLWithModifications(int client,int spool,URL *Url)
- {
- static int first=1;
- if(AddCacheInfo)
- {
- struct stat buf;
- time_t t_ago;
- char *date,*timeunit,timeago[8];
- fstat(spool,&buf);
- t_ago=time(NULL)-buf.st_mtime;
- date=RFC822Date(buf.st_mtime,0);
-
- if(t_ago<0)
- {strcpy(timeago,"?");timeunit="";}
- else if(t_ago<3600)
- {sprintf(timeago,"%ld",t_ago/60);timeunit="m";}
- else if(t_ago<(24*3600))
- {sprintf(timeago,"%ld",t_ago/3600);timeunit="h";}
- else if(t_ago<(14*24*3600))
- {sprintf(timeago,"%ld",t_ago/(24*3600));timeunit="d";}
- else if(t_ago<(30*24*3600))
- {sprintf(timeago,"%ld",t_ago/(7*24*3600));timeunit="w";}
- else
- {sprintf(timeago,"%ld",t_ago/(30*24*3600));timeunit="M";}
- cache_info=HTMLMessageBody(-1,"AddCacheInfo",
- "url",Url->name,
- "date",date,
- "time",timeago,
- "unit",timeunit,
- NULL);
- }
- baseUrl=Url;
- output_fd=client;
- htmlmodify_yyfd=spool;
- if(!first)
- htmlmodify_yyrestart(NULL);
- modify_html();
- if(cache_info)
- free(cache_info);
- cache_info=NULL;
- first=0;
- }
/*+ A macro to output the data if valid to do so.  It expands to a single
    statement so that it is safe as the body of an unbraced if / else; the
    caller supplies the terminating semicolon. +*/
#define YY_OUTPUT(text) \
do \
  { \
   if(!disable_output) \
      write_string(output_fd,text); \
  } \
while(0)
- /*++++++++++++++++++++++++++++++++++++++
- Modify the HTML looking for all of the things to be changed.
- ++++++++++++++++++++++++++++++++++++++*/
- static void modify_html(void)
- {
- HTMLTags tag=tag_ntags;
- HTMLAttributes key=att_natts;
- int url_cached=0;
- int yychar;
- int disable_key_val;
- char *key_string=NULL;
- /* The actual parser. */
- while((yychar=htmlmodify_yylex()))
- switch(yychar)
- {
- case LEX_PLAINTEXT:
- break;
- case LEX_COMMENT:
- break;
- case LEX_DOCTYPE:
- break;
- case LEX_TAG_BEGIN:
- for(tag=0;tag<tag_ntags;tag++)
- if(!strcasecmp(htmlmodify_yylval,tags[tag]))
- break;
- if(tag==tag__a)
- {
- if(url_cached==1)
- {if(AnchorModifyEnd[0]) YY_OUTPUT(AnchorModifyEnd[0]);}
- else if(url_cached==2)
- {if(AnchorModifyEnd[1]) YY_OUTPUT(AnchorModifyEnd[1]);}
- else if(url_cached==-1)
- {if(AnchorModifyEnd[2]) YY_OUTPUT(AnchorModifyEnd[2]);}
- url_cached=0;
- }
- else if(tag==tag__body && cache_info)
- {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
- else if(tag==tag__html && cache_info)
- {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
- else if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
- disable_output|=DISABLE_BLINK;
- else if(DisableHTMLScript && tag==tag_script)
- disable_output|=DISABLE_SCRIPT;
- else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
- disable_output|=DISABLE_SCRIPT;
- YY_OUTPUT("<");
- YY_OUTPUT(htmlmodify_yylval);
- break;
- case LEX_TAG_END:
- YY_OUTPUT(">");
- if(tag==tag_a)
- {
- if(url_cached==1)
- {if(AnchorModifyBegin[0]) YY_OUTPUT(AnchorModifyBegin[0]);}
- else if(url_cached==2)
- {if(AnchorModifyBegin[1]) YY_OUTPUT(AnchorModifyBegin[1]);}
- else if(url_cached==-1)
- {if(AnchorModifyBegin[2]) YY_OUTPUT(AnchorModifyBegin[2]);}
- }
- if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
- disable_output&=~DISABLE_BLINK;
- else if(DisableHTMLScript && tag==tag__script)
- disable_output&=~DISABLE_SCRIPT;
- else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
- disable_output&=~DISABLE_SCRIPT;
- tag=tag_ntags;
- key=att_natts;
- break;
- case LEX_ATTR_KEY:
- key_string=(char*)realloc((void*)key_string,strlen(htmlmodify_yylval)+1);
- strcpy(key_string,htmlmodify_yylval);
- for(key=0;key<att_natts;key++)
- if(!strcasecmp(htmlmodify_yylval,attributes[key]))
- break;
- break;
- case LEX_ATTR_VAL:
- disable_key_val=0;
- if(htmlmodify_yylval && key!=att_natts)
- {
- /* Links */
- if(key==att_href && tag==tag_a && *htmlmodify_yylval)
- {
- char *link=NULL,*p,oldp=0;
- URL *Url=NULL;
- for(p=htmlmodify_yylval;*p;p++)
- if(*p=='#')
- {
- oldp=*p;
- *p=0;
- break;
- }
- if(*htmlmodify_yylval)
- {
- link=LinkURL(baseUrl,htmlmodify_yylval);
- Url=SplitURL(link);
- }
- if(!Url || !Url->Protocol)
- url_cached=0;
- else if(ExistsWebpageSpoolFile(Url) || IsLocalNetHost(Url->host))
- url_cached=1;
- else if(ExistsOutgoingSpoolFile(Url))
- url_cached=2;
- else
- url_cached=-1;
- if(link!=htmlmodify_yylval)
- free(link);
- if(Url)
- FreeURL(Url);
- *p=oldp;
- }
- /* Base tag */
- else if(key==att_href && tag==tag_base)
- baseUrl=SplitURL(htmlmodify_yylval);
- /* Script events */
- else if(DisableHTMLScript &&
- (key==att_onblur || key==att_onchange || key==att_onclick || key==att_ondblclick || key==att_onfocus ||
- key==att_onkeydown || key==att_onkeypress || key==att_onload || key==att_onmousedown ||
- key==att_onmousemove || key==att_onmouseout || key==att_onmouseover || key==att_onmouseup ||
- key==att_onreset || key==att_onselect || key==att_onsubmit || key==att_onunload))
- disable_key_val=1;
- }
- if(!disable_key_val)
- {
- YY_OUTPUT(key_string);
- if(htmlmodify_yylval)
- {
- YY_OUTPUT("=");
- if(*quote)
- YY_OUTPUT(quote);
- YY_OUTPUT(htmlmodify_yylval);
- if(*quote)
- YY_OUTPUT(quote);
- }
- }
- key=att_natts;
- break;
- default:
- }
- if(cache_info)
- {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
- if(key_string)
- free(key_string);
- }
#ifndef htmlmodify_yywrap
/*+ Needed in lex but does nothing; the value 1 tells the lexer that there is
    no more input after the end of the file. +*/
#define htmlmodify_yywrap() (1)
#endif
/*+ Reset the current string.  This uses the 'string' and 'stringused'
    variables that are in scope where it is expanded and expands to a single
    statement; the caller supplies the terminating semicolon. +*/
#define reset_string \
do \
  { \
   *string=0; \
   stringused=0; \
  } \
while(0)
/*+ Append information to the current string.  This uses the 'string',
    'stringlen', 'stringused' and 'newlen' variables that are in scope where it
    is expanded and evaluates its argument twice.  The buffer is grown to the
    exact size needed; if that fails the text is not appended and the existing
    string is left intact.  It expands to a single statement; the caller
    supplies the terminating semicolon. +*/
#define append_string(xx) \
do \
  { \
   newlen=strlen(xx); \
   if((stringused+newlen)>=stringlen) \
     { \
      char *append_string_new=(char*)realloc((void*)string,stringused+newlen+1); \
      if(!append_string_new) \
         break; \
      string=append_string_new; \
      stringlen=stringused+newlen+1; \
     } \
   strcpy(string+stringused,xx); \
   stringused+=newlen; \
  } \
while(0)
/*+ A macro to read data that can be used by the lexer.  Up to max_size bytes
    are read from the htmlmodify_yyfd file descriptor into buf using
    read_data(); a result of -1 from it is reported to the lexer as zero bytes
    read. +*/
#define YY_INPUT(buf,result,max_size) \
        if((result=read_data(htmlmodify_yyfd,buf,max_size))==-1) \
           result=0;
%}

%%
 /* The string buffer is static because this code runs on every entry to the
    lexer and a returned token value can point into the buffer; allocating a
    new one for each call would lose the previous one.  It is created on first
    use and released at the end of the file. */
 static char *string=NULL;
 static int stringlen=0;
 int stringused=0,newlen;

 if(!string)
   {
    string=(char*)malloc(128);
    if(!string)
       YY_FATAL_ERROR("out of memory allocating the HTML string buffer");
    stringlen=128;
    *string=0;
   }

 /* Handle comments and other tags.  Plain text, comments and doctypes are
    written directly to the output and are not returned as tokens. */

[^<]+                       { YY_OUTPUT(htmlmodify_yytext); }
"<!DOCTYPE"                 { YY_OUTPUT(htmlmodify_yytext); BEGIN(DOCTYPE); reset_string; }
"<!--"                      { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT); reset_string; }
"<!"{W}*"-"*                { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT_BAD); reset_string; }
"<"{W}*                     { BEGIN(TAG_START); reset_string; append_string(htmlmodify_yytext); }

 /* Doctype (DTD) */

<DOCTYPE>">"                { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<DOCTYPE>[^>]+              { YY_OUTPUT(htmlmodify_yytext); }

 /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
               COMMENT is not strictly correct, but works better than the real thing. */

<COMMENT>"--"{W}*">"        { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<COMMENT>">"                { YY_OUTPUT(htmlmodify_yytext); }
<COMMENT>"-"                { YY_OUTPUT(htmlmodify_yytext); }
<COMMENT>[^->]+             { YY_OUTPUT(htmlmodify_yytext); }

<COMMENT_BAD>">"            { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<COMMENT_BAD>[^>]+          { YY_OUTPUT(htmlmodify_yytext); }

 /* Tags - anything after the '<' that is not a tag name is output unchanged. */

<TAG_START>"/"?{K}+/{W}     { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>"/"?{K}+/">"     { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>(.|\n)           { BEGIN(INITIAL); YY_OUTPUT(string); YY_OUTPUT(htmlmodify_yytext); }

<TAG>">"                    { BEGIN(INITIAL); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>"<"                    { BEGIN(INITIAL); unput(htmlmodify_yytext[0]); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>{K}+                   { BEGIN(TAG_ATTR_KEY); htmlmodify_yylval=htmlmodify_yytext; return(LEX_ATTR_KEY); }
<TAG>(.|\n)                 { YY_OUTPUT(htmlmodify_yytext); }

<TAG_ATTR_KEY>{W}*=         { BEGIN(TAG_ATTR_VAL); }
<TAG_ATTR_KEY>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); htmlmodify_yylval=NULL; return(LEX_ATTR_VAL); }

<TAG_ATTR_VAL>\"            { BEGIN(DQUOTED); reset_string; }
<TAG_ATTR_VAL>\'            { BEGIN(SQUOTED); reset_string; }
<TAG_ATTR_VAL>{W}+          { }
<TAG_ATTR_VAL>{F}+          { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; quote=""; return(LEX_ATTR_VAL); }
<TAG_ATTR_VAL>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); htmlmodify_yylval=""; quote=""; return(LEX_ATTR_VAL); }

 /* Quoted strings - backslash escapes are kept, line breaks are dropped. */

<DQUOTED>\\\"               { append_string(htmlmodify_yytext); }
<DQUOTED>\\                 { append_string(htmlmodify_yytext); }
<DQUOTED>\"                 { BEGIN(TAG); htmlmodify_yylval=string; quote="\""; return(LEX_ATTR_VAL); }
<DQUOTED>(\r|\n)+           { }
<DQUOTED>[^\\\"\r\n]+       { append_string(htmlmodify_yytext); }

<SQUOTED>\\\'               { append_string(htmlmodify_yytext); }
<SQUOTED>\\                 { append_string(htmlmodify_yytext); }
<SQUOTED>\'                 { BEGIN(TAG); htmlmodify_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
<SQUOTED>(\r|\n)+           { }
<SQUOTED>[^\\\'\r\n]+       { append_string(htmlmodify_yytext); }

 /* End of file - release the buffer so that the next file starts afresh. */

<<EOF>>                     { free(string); string=NULL; stringlen=0; BEGIN(INITIAL); return(0); }

%%