htmlmodify.l
上传用户:seven77cht
上传日期:2007-01-04
资源大小:486k
文件大小:15k
源码类别:

浏览器

开发平台:

Unix_Linux

  /* Character classes used by the rules:
     W - white space (space, tab, carriage return, newline),
     F - characters accepted in an unquoted attribute value,
     K - characters accepted in a tag name or an attribute name. */
W               [ \t\r\n]
F               [-a-z0-9$_.!*(),%;/?:@&=+~|#]
K               [a-z0-9-]
  /* Exclusive start conditions, one for each context the scanner can be in. */
%x DOCTYPE
%x COMMENT COMMENT_BAD
%x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
%x DQUOTED SQUOTED
  8. %{
  9. /***************************************
  10.   $Header: /home/amb/wwwoffle/RCS/htmlmodify.l 1.14 2000/03/22 18:34:19 amb Exp $
  11.   WWWOFFLE - World Wide Web Offline Explorer - Version 2.5e.
  12.   Parse the HTML and modify the source.
  13.   ******************/ /******************
  14.   Written by Andrew M. Bishop
  15.   This file Copyright 1997,98,99,2000 Andrew M. Bishop
  16.   It may be distributed under the GNU Public License, version 2, or
  17.   any higher version.  See section COPYING of the GNU Public license
  18.   for conditions under which this file may be redistributed.
  19.   ***************************************/
  20. #include <stdlib.h>
  21. #include <string.h>
  22. #include <ctype.h>
  23. #include <sys/stat.h>
  24. #include <unistd.h>
  25. #include <time.h>
  26. #include "wwwoffle.h"
  27. #include "document.h"
  28. #include "config.h"
  29. #include "misc.h"
  /* Parser outputs - the token codes that htmlmodify_yylex() hands to modify_html().
     Zero is not used as a token because the parser loop stops on a zero return. */
  enum
  {
   LEX_PLAINTEXT = 1,
   LEX_COMMENT   = 2,
   LEX_DOCTYPE   = 3,
   LEX_TAG_BEGIN = 4,
   LEX_TAG_END   = 5,
   LEX_ATTR_KEY  = 6,
   LEX_ATTR_VAL  = 7
  };
  /* Tag types */

  /*+ The tags that the modifier reacts to; each value is the index of the
      matching name in tags[].  A leading '/' in the name (a double underscore
      in the identifier) marks a closing tag. +*/
  typedef enum HTMLTags
  {
   tag_a         = 0,  /* "a"         */
   tag__a        = 1,  /* "/a"        */
   tag_base      = 2,  /* "base"      */
   tag_blink     = 3,  /* "blink"     */
   tag__blink    = 4,  /* "/blink"    */
   tag__body     = 5,  /* "/body"     */
   tag__html     = 6,  /* "/html"     */
   tag_noscript  = 7,  /* "noscript"  */
   tag__noscript = 8,  /* "/noscript" */
   tag_script    = 9,  /* "script"    */
   tag__script   =10,  /* "/script"   */
   tag_ntags     =11   /* The number of tags, also the value meaning "no known tag". */
  }
  HTMLTags;

  /* Tag strings */

  /*+ The tag names, indexed by HTMLTags (read only). +*/
  static const char * const tags[]=
  {
   /* tag_a         = 0  */  "a"         ,
   /* tag__a        = 1  */  "/a"        ,
   /* tag_base      = 2  */  "base"      ,
   /* tag_blink     = 3  */  "blink"     ,
   /* tag__blink    = 4  */  "/blink"    ,
   /* tag__body     = 5  */  "/body"     ,
   /* tag__html     = 6  */  "/html"     ,
   /* tag_noscript  = 7  */  "noscript"  ,
   /* tag__noscript = 8  */  "/noscript" ,
   /* tag_script    = 9  */  "script"    ,
   /* tag__script   =10  */  "/script"
  };

  /*+ Compilation fails here (negative array size) if tags[] and HTMLTags get out of step. +*/
  typedef char tags_table_matches_HTMLTags[(sizeof(tags)/sizeof(tags[0])==tag_ntags)?1:-1];
  /* Attribute types */

  /*+ The attributes that the modifier reacts to; each value is the index of the
      matching name in attributes[]. +*/
  typedef enum HTMLAttributes
  {
   att_href        = 0,  /* "href"        */
   att_onblur      = 1,  /* "onblur"      */
   att_onchange    = 2,  /* "onchange"    */
   att_onclick     = 3,  /* "onclick"     */
   att_ondblclick  = 4,  /* "ondblclick"  */
   att_onfocus     = 5,  /* "onfocus"     */
   att_onkeydown   = 6,  /* "onkeydown"   */
   att_onkeypress  = 7,  /* "onkeypress"  */
   att_onload      = 8,  /* "onload"      */
   att_onmousedown = 9,  /* "onmousedown" */
   att_onmousemove =10,  /* "onmousemove" */
   att_onmouseout  =11,  /* "onmouseout"  */
   att_onmouseover =12,  /* "onmouseover" */
   att_onmouseup   =13,  /* "onmouseup"   */
   att_onreset     =14,  /* "onreset"     */
   att_onselect    =15,  /* "onselect"    */
   att_onsubmit    =16,  /* "onsubmit"    */
   att_onunload    =17,  /* "onunload"    */
   att_natts       =18   /* The number of attributes, also the value meaning "no known attribute". */
  }
  HTMLAttributes;

  /* Attribute strings. */

  /*+ The attribute names, indexed by HTMLAttributes (read only). +*/
  static const char * const attributes[]=
  {
   /* att_href        = 0 */ "href"        ,
   /* att_onblur      = 1 */ "onblur"      ,
   /* att_onchange    = 2 */ "onchange"    ,
   /* att_onclick     = 3 */ "onclick"     ,
   /* att_ondblclick  = 4 */ "ondblclick"  ,
   /* att_onfocus     = 5 */ "onfocus"     ,
   /* att_onkeydown   = 6 */ "onkeydown"   ,
   /* att_onkeypress  = 7 */ "onkeypress"  ,
   /* att_onload      = 8 */ "onload"      ,
   /* att_onmousedown = 9 */ "onmousedown" ,
   /* att_onmousemove =10 */ "onmousemove" ,
   /* att_onmouseout  =11 */ "onmouseout"  ,
   /* att_onmouseover =12 */ "onmouseover" ,
   /* att_onmouseup   =13 */ "onmouseup"   ,
   /* att_onreset     =14 */ "onreset"     ,
   /* att_onselect    =15 */ "onselect"    ,
   /* att_onsubmit    =16 */ "onsubmit"    ,
   /* att_onunload    =17 */ "onunload"
  };

  /*+ Compilation fails here (negative array size) if attributes[] and HTMLAttributes get out of step. +*/
  typedef char attributes_table_matches_HTMLAttributes[(sizeof(attributes)/sizeof(attributes[0])==att_natts)?1:-1];
  116. /* Definitions of why the output is disabled. */
  117. #define DISABLE_NONE   0
  118. #define DISABLE_SCRIPT 1
  119. #define DISABLE_BLINK  2
  120. static void modify_html(void);
  121. static char *htmlmodify_yylval=NULL;
  122. extern int htmlmodify_yylex(void);
  123. /*+ The file descriptor to output to. +*/
  124. static int output_fd=-1;
  125. /*+ The add-cache-info optional footer. +*/
  126. static char *cache_info=NULL;
  127. /*+ The file descriptor that we are reading from. +*/
  128. static int htmlmodify_yyfd=-1;
  129. /*+ The base URL of this page. +*/
  130. static URL *baseUrl=NULL;
  131. /*+ The quote character used. +*/
  132. static char *quote="";
  133. /*+ Set this to disable the output. +*/
  134. static int disable_output=DISABLE_NONE;
  135. /*++++++++++++++++++++++++++++++++++++++
  136.   Output the file with the modificatons if it is HTML, else just output.
  137.   int client The file to write to.
  138.   int spool The file to read from.
  139.   URL *Url The URL that we are parsing.
  140.   ++++++++++++++++++++++++++++++++++++++*/
  141. void OutputHTMLWithModifications(int client,int spool,URL *Url)
  142. {
  143.  static int first=1;
  144.  if(AddCacheInfo)
  145.    {
  146.     struct stat buf;
  147.     time_t t_ago;
  148.     char *date,*timeunit,timeago[8];
  149.     fstat(spool,&buf);
  150.     t_ago=time(NULL)-buf.st_mtime;
  151.     date=RFC822Date(buf.st_mtime,0);
  152.     
  153.     if(t_ago<0)
  154.       {strcpy(timeago,"?");timeunit="";}
  155.     else if(t_ago<3600)
  156.       {sprintf(timeago,"%ld",t_ago/60);timeunit="m";}
  157.     else if(t_ago<(24*3600))
  158.       {sprintf(timeago,"%ld",t_ago/3600);timeunit="h";}
  159.     else if(t_ago<(14*24*3600))
  160.       {sprintf(timeago,"%ld",t_ago/(24*3600));timeunit="d";}
  161.     else if(t_ago<(30*24*3600))
  162.       {sprintf(timeago,"%ld",t_ago/(7*24*3600));timeunit="w";}
  163.     else
  164.       {sprintf(timeago,"%ld",t_ago/(30*24*3600));timeunit="M";}
  165.     cache_info=HTMLMessageBody(-1,"AddCacheInfo",
  166.                                "url",Url->name,
  167.                                "date",date,
  168.                                "time",timeago,
  169.                                "unit",timeunit,
  170.                                NULL);
  171.    }
  172.  baseUrl=Url;
  173.  output_fd=client;
  174.  htmlmodify_yyfd=spool;
  175.  if(!first)
  176.     htmlmodify_yyrestart(NULL);
  177.  modify_html();
  178.  if(cache_info)
  179.     free(cache_info);
  180.  cache_info=NULL;
  181.  first=0;
  182. }
  /*+ A macro to output the data if valid to do so.
      It is wrapped in do ... while(0) so that it is a single statement and can
      be used safely as the body of an if that has an else. +*/
  #define YY_OUTPUT(text) \
             do \
               { \
                if(!disable_output) \
                   write_string(output_fd,text); \
               } \
             while(0)
  187. /*++++++++++++++++++++++++++++++++++++++
  188.   Modify the HTML looking for all of the things to be changed.
  189.   ++++++++++++++++++++++++++++++++++++++*/
  190. static void modify_html(void)
  191. {
  192.  HTMLTags tag=tag_ntags;
  193.  HTMLAttributes key=att_natts;
  194.  int url_cached=0;
  195.  int yychar;
  196.  int disable_key_val;
  197.  char *key_string=NULL;
  198.  /* The actual parser. */
  199.  while((yychar=htmlmodify_yylex()))
  200.     switch(yychar)
  201.       {
  202.       case LEX_PLAINTEXT:
  203.        break;
  204.       case LEX_COMMENT:
  205.        break;
  206.       case LEX_DOCTYPE:
  207.        break;
  208.       case LEX_TAG_BEGIN:
  209.        for(tag=0;tag<tag_ntags;tag++)
  210.           if(!strcasecmp(htmlmodify_yylval,tags[tag]))
  211.              break;
  212.        if(tag==tag__a)
  213.          {
  214.           if(url_cached==1)
  215.             {if(AnchorModifyEnd[0]) YY_OUTPUT(AnchorModifyEnd[0]);}
  216.           else if(url_cached==2)
  217.             {if(AnchorModifyEnd[1]) YY_OUTPUT(AnchorModifyEnd[1]);}
  218.           else if(url_cached==-1)
  219.             {if(AnchorModifyEnd[2]) YY_OUTPUT(AnchorModifyEnd[2]);}
  220.           url_cached=0;
  221.          }
  222.        else if(tag==tag__body && cache_info)
  223.          {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
  224.        else if(tag==tag__html && cache_info)
  225.          {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
  226.        else if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
  227.           disable_output|=DISABLE_BLINK;
  228.        else if(DisableHTMLScript && tag==tag_script)
  229.           disable_output|=DISABLE_SCRIPT;
  230.        else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
  231.          disable_output|=DISABLE_SCRIPT;
  232.        YY_OUTPUT("<");
  233.        YY_OUTPUT(htmlmodify_yylval);
  234.        break;
  235.       case LEX_TAG_END:
  236.        YY_OUTPUT(">");
  237.        if(tag==tag_a)
  238.          {
  239.           if(url_cached==1)
  240.             {if(AnchorModifyBegin[0]) YY_OUTPUT(AnchorModifyBegin[0]);}
  241.           else if(url_cached==2)
  242.             {if(AnchorModifyBegin[1]) YY_OUTPUT(AnchorModifyBegin[1]);}
  243.           else if(url_cached==-1)
  244.             {if(AnchorModifyBegin[2]) YY_OUTPUT(AnchorModifyBegin[2]);}
  245.          }
  246.        if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
  247.           disable_output&=~DISABLE_BLINK;
  248.        else if(DisableHTMLScript && tag==tag__script)
  249.           disable_output&=~DISABLE_SCRIPT;
  250.        else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
  251.           disable_output&=~DISABLE_SCRIPT;
  252.        tag=tag_ntags;
  253.        key=att_natts;
  254.        break;
  255.       case LEX_ATTR_KEY:
  256.        key_string=(char*)realloc((void*)key_string,strlen(htmlmodify_yylval)+1);
  257.        strcpy(key_string,htmlmodify_yylval);
  258.        for(key=0;key<att_natts;key++)
  259.           if(!strcasecmp(htmlmodify_yylval,attributes[key]))
  260.              break;
  261.       break;
  262.       case LEX_ATTR_VAL:
  263.        disable_key_val=0;
  264.        if(htmlmodify_yylval && key!=att_natts)
  265.          {
  266.           /* Links */
  267.           if(key==att_href && tag==tag_a && *htmlmodify_yylval)
  268.             {
  269.              char *link=NULL,*p,oldp=0;
  270.              URL *Url=NULL;
  271.              for(p=htmlmodify_yylval;*p;p++)
  272.                 if(*p=='#')
  273.                   {
  274.                    oldp=*p;
  275.                    *p=0;
  276.                    break;
  277.                   }
  278.              if(*htmlmodify_yylval)
  279.                {
  280.                 link=LinkURL(baseUrl,htmlmodify_yylval);
  281.                 Url=SplitURL(link);
  282.                }
  283.              if(!Url || !Url->Protocol)
  284.                 url_cached=0;
  285.              else if(ExistsWebpageSpoolFile(Url) || IsLocalNetHost(Url->host))
  286.                 url_cached=1;
  287.              else if(ExistsOutgoingSpoolFile(Url))
  288.                 url_cached=2;
  289.              else
  290.                 url_cached=-1;
  291.              if(link!=htmlmodify_yylval)
  292.                 free(link);
  293.              if(Url)
  294.                 FreeURL(Url);
  295.              *p=oldp;
  296.             }
  297.           /* Base tag */
  298.           else if(key==att_href && tag==tag_base)
  299.              baseUrl=SplitURL(htmlmodify_yylval);
  300.           /* Script events */
  301.           else if(DisableHTMLScript &&
  302.                   (key==att_onblur || key==att_onchange || key==att_onclick || key==att_ondblclick || key==att_onfocus ||
  303.                    key==att_onkeydown || key==att_onkeypress || key==att_onload || key==att_onmousedown ||
  304.                    key==att_onmousemove || key==att_onmouseout || key==att_onmouseover || key==att_onmouseup ||
  305.                    key==att_onreset || key==att_onselect || key==att_onsubmit || key==att_onunload))
  306.              disable_key_val=1;
  307.          }
  308.        if(!disable_key_val)
  309.          {
  310.           YY_OUTPUT(key_string);
  311.           if(htmlmodify_yylval)
  312.             {
  313.              YY_OUTPUT("=");
  314.              if(*quote)
  315.                 YY_OUTPUT(quote);
  316.              YY_OUTPUT(htmlmodify_yylval);
  317.              if(*quote)
  318.                 YY_OUTPUT(quote);
  319.             }
  320.          }
  321.        key=att_natts;
  322.        break;
  323.       default:
  324.       }
  325.  if(cache_info)
  326.    {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
  327.  if(key_string)
  328.     free(key_string);
  329. }
  /*+ The scanner asks this at the end of the input; there is never another file
      to continue with so the answer is always 1 (used unless already defined). +*/
  #if !defined(htmlmodify_yywrap)
  #define htmlmodify_yywrap() (1)
  #endif
  /*+ Reset the current string (the scanner's 'string' buffer) to be empty.
      Wrapped in do ... while(0) so that 'reset_string;' is one statement. +*/
  #define reset_string \
   do \
     { \
      *string=0; \
      stringused=0; \
     } \
   while(0)
  /*+ Append information to the current string, growing the buffer when needed.
      It uses the scanner's 'string', 'stringlen', 'stringused' and 'newlen' variables.
      If the buffer cannot be grown the old contents are kept and the new text is
      left out instead of writing through a NULL pointer. +*/
  #define append_string(xx) \
   do \
     { \
      newlen=strlen(xx); \
      if((stringused+newlen)>=stringlen) \
        { \
         char *grown_string=(char*)realloc((void*)string,stringused+newlen+1); \
         if(!grown_string) \
            break; \
         string=grown_string; \
         stringlen=stringused+newlen+1; \
        } \
      strcpy(string+stringused,xx); \
      stringused+=newlen; \
     } \
   while(0)
  /*+ A macro to read data that can be used by the lexer.
      It reads from htmlmodify_yyfd using read_data() and reports a result of -1
      from read_data() to the scanner as zero bytes read. +*/
  #define YY_INPUT(buf,result,max_size) \
          do \
            { \
             if((result=read_data(htmlmodify_yyfd,buf,max_size))==-1) \
                result=0; \
            } \
          while(0)
  349. %}
  350. %%
  /* The code here runs at the start of every call to the scanner.  The buffer is
     static so that it is allocated once per document and not once per token. */

  static char *string=NULL;
  static int stringlen=0,stringused=0;
  int newlen;

  if(!string)
    {
     string=malloc(128);
     if(!string)
        return(0);              /* Out of memory, treated as the end of the input. */
     stringlen=128;
     stringused=0;
    }

 /* Handle comments and other tags; plain text, comments and the DOCTYPE are
    written straight to the output and are not returned as tokens. */

[^<]+                       { YY_OUTPUT(htmlmodify_yytext); }
"<!DOCTYPE"                 { YY_OUTPUT(htmlmodify_yytext); BEGIN(DOCTYPE); reset_string; }
"<!--"                      { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT); reset_string; }
"<!"{W}*"-"*                { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT_BAD); reset_string; }
"<"{W}*                     { BEGIN(TAG_START); reset_string; append_string(htmlmodify_yytext); }

 /* Doctype (DTD) */

<DOCTYPE>">"                { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<DOCTYPE>[^>]+              { YY_OUTPUT(htmlmodify_yytext); }

 /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
               COMMENT is not strictly correct, but works better than the real thing. */

<COMMENT>"--"{W}*">"        { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<COMMENT>">"                { YY_OUTPUT(htmlmodify_yytext); }
<COMMENT>"-"                { YY_OUTPUT(htmlmodify_yytext); }
<COMMENT>[^->]+             { YY_OUTPUT(htmlmodify_yytext); }

<COMMENT_BAD>">"            { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); }
<COMMENT_BAD>[^>]+          { YY_OUTPUT(htmlmodify_yytext); }

 /* Tags - anything after '<' that is not a tag name is written out unchanged. */

<TAG_START>"/"?{K}+/{W}     { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>"/"?{K}+/">"     { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>(.|\n)           { BEGIN(INITIAL); YY_OUTPUT(string); YY_OUTPUT(htmlmodify_yytext); }

<TAG>">"                    { BEGIN(INITIAL); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>"<"                    { BEGIN(INITIAL); unput(htmlmodify_yytext[0]); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>{K}+                   { BEGIN(TAG_ATTR_KEY); htmlmodify_yylval=htmlmodify_yytext; return(LEX_ATTR_KEY); }
<TAG>(.|\n)                 { YY_OUTPUT(htmlmodify_yytext); }

 /* An attribute name that is not followed by '=' is reported with a NULL value. */

<TAG_ATTR_KEY>{W}*=         { BEGIN(TAG_ATTR_VAL); }
<TAG_ATTR_KEY>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); htmlmodify_yylval=NULL; return(LEX_ATTR_VAL); }

<TAG_ATTR_VAL>\"            { BEGIN(DQUOTED); reset_string; }
<TAG_ATTR_VAL>\'            { BEGIN(SQUOTED); reset_string; }
<TAG_ATTR_VAL>{W}+          { }
<TAG_ATTR_VAL>{F}+          { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; quote=""; return(LEX_ATTR_VAL); }
<TAG_ATTR_VAL>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); htmlmodify_yylval=""; quote=""; return(LEX_ATTR_VAL); }

 /* Quoted strings - a backslash escaped quote does not end the string and
    line breaks inside the string are dropped. */

<DQUOTED>\\\"               { append_string(htmlmodify_yytext); }
<DQUOTED>\\                 { append_string(htmlmodify_yytext); }
<DQUOTED>\"                 { BEGIN(TAG); htmlmodify_yylval=string; quote="\""; return(LEX_ATTR_VAL); }
<DQUOTED>(\r|\n)+           { }
<DQUOTED>[^\\\"\r\n]+       { append_string(htmlmodify_yytext); }

<SQUOTED>\\\'               { append_string(htmlmodify_yytext); }
<SQUOTED>\\                 { append_string(htmlmodify_yytext); }
<SQUOTED>\'                 { BEGIN(TAG); htmlmodify_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
<SQUOTED>(\r|\n)+           { }
<SQUOTED>[^\\\'\r\n]+       { append_string(htmlmodify_yytext); }

 /* End of file - release the buffer so that the next document starts afresh. */

<<EOF>>                     { free(string); string=NULL; stringlen=0; stringused=0; BEGIN(INITIAL); return(0); }
  398. %%