html.l
上传用户:seven77cht
上传日期:2007-01-04
资源大小:486k
文件大小:21k
源码类别:

浏览器

开发平台:

Unix_Linux

  1. W               [ trn]
  2. F               [-a-z0-9$_.!*(),%;/?:@&=+~|#]
  3. K               [a-z0-9-]
  4. %x DOCTYPE
  5. %x COMMENT COMMENT_BAD
  6. %x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
  7. %x DQUOTED SQUOTED
  8. %{
  9. /***************************************
  10.   $Header: /home/amb/wwwoffle/RCS/html.l 2.48 2000/03/14 19:12:21 amb Exp $
  11.   WWWOFFLE - World Wide Web Offline Explorer - Version 2.5e.
  12.   Parse the HTML and look for the images, links and other things.
  13.   ******************/ /******************
  14.   Written by Andrew M. Bishop
  Object handling by Walter Pfannenmueller
  16.   This file Copyright 1997,98,99,2000 Andrew M. Bishop
  17.   It may be distributed under the GNU Public License, version 2, or
  18.   any higher version.  See section COPYING of the GNU Public license
  19.   for conditions under which this file may be redistributed.
  20.   ***************************************/
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include <ctype.h>
  24. #include <sys/stat.h>
  25. #include <unistd.h>
  26. #include <time.h>
  27. #include "wwwoffle.h"
  28. #include "document.h"
  29. #include "config.h"
  30. #include "misc.h"
  31. #include "errors.h"
  32. /* Parser outputs */
  33. #define LEX_PLAINTEXT  1
  34. #define LEX_COMMENT    2
  35. #define LEX_DOCTYPE    3
  36. #define LEX_TAG_BEGIN  4
  37. #define LEX_TAG_END    5
  38. #define LEX_ATTR_KEY   6
  39. #define LEX_ATTR_VAL   7
  40. /* Tag types */
  41. typedef enum _HTMLTags
  42. {
  43.  tag_a         = 0  /* "a"          */ ,
  44.  tag_applet    = 1  /* "applet"     */ ,
  45.  tag_area      = 2  /* "area"       */ ,
  46.  tag_base      = 3  /* "base"       */ ,
  47.  tag_blockquote= 4  /* "blockquote" */ ,
  48.  tag_body      = 5  /* "body"       */ ,
  49.  tag_del       = 6  /* "del"        */ ,
  50.  tag_embed     = 7  /* "embed"      */ ,
  51.  tag_frame     = 8  /* "frame"      */ ,
  52.  tag_head      = 9  /* "head"       */ ,
  53.  tag_iframe    =10  /* "iframes"    */ ,
  54.  tag_img       =11  /* "img"        */ ,
  55.  tag_input     =12  /* "input"      */ ,
  56.  tag_ins       =13  /* "ins"        */ ,
  57.  tag_link      =14  /* "link"       */ ,
  58.  tag_meta      =15  /* "meta"       */ ,
  59.  tag_object    =16  /* "object"     */ ,
  60.  tag_param     =17  /* "param"      */ ,
  61.  tag_q         =18  /* "q"          */ ,
  62.  tag_script    =19  /* "script"     */ ,
  63.  tag_xml       =20  /* "xml"        */ ,
  64.  tag_ntags     =21
  65. }
  66. HTMLTags;
  67. /* Tag strings */
  68. static char *tags[]=
  69. {
  70.  /* tag_a         = 0  */  "a"          ,
  71.  /* tag_applet    = 1  */  "applet"     ,
  72.  /* tag_area      = 2  */  "area"       ,
  73.  /* tag_base      = 3  */  "base"       ,
  74.  /* tag_blockquote= 4  */  "blockquote" ,
  75.  /* tag_body      = 5  */  "body"       ,
  76.  /* tag_del       = 6  */  "del"        ,
  77.  /* tag_embed     = 7  */  "embed"      ,
  78.  /* tag_frame     = 8  */  "frame"      ,
  79.  /* tag_head      = 9  */  "head"       ,
  80.  /* tag_iframe    =10  */  "iframes"    ,
  81.  /* tag_img       =11  */  "img"        ,
  82.  /* tag_input     =12  */  "input"      ,
  83.  /* tag_ins       =13  */  "ins"        ,
  84.  /* tag_link      =14  */  "link"       ,
  85.  /* tag_meta      =15  */  "meta"       ,
  86.  /* tag_object    =16  */  "object"     ,
  87.  /* tag_param     =17  */  "param"      ,
  88.  /* tag_q         =18  */  "q"          ,
  89.  /* tag_script    =19  */  "script"     ,
  90.  /* tag_xml       =20  */  "xml"       
  91. };
  92. /* Attribute types */
  93. typedef enum _HTMLAttributes
  94. {
  95.  att_archive   = 0  /* "archive"    */ ,
  96.  att_background= 1  /* "background" */ ,
  97.  att_cite      = 2  /* "cite"       */ ,
  98.  att_classid   = 3  /* "classid"    */ ,
  99.  att_code      = 4  /* "code"       */ ,
  100.  att_codebase  = 5  /* "codebase"   */ ,
  101.  att_codetype  = 6  /* "codetype"   */ ,
  102.  att_content   = 7  /* "content"    */ ,
  103.  att_data      = 8  /* "data"       */ ,
  104.  att_href      = 9  /* "href"       */ ,
  105.  att_http_equiv=10  /* "http-equiv" */ ,
  106.  att_longdesc  =11  /* "longdesc"   */ ,
  107.  att_name      =12  /* "name"       */ ,
  108.  att_object    =13  /* "object"     */ ,
  109.  att_profile   =14  /* "profile"    */ ,
  110.  att_rel       =15  /* "rel"        */ ,
  111.  att_src       =16  /* "src"        */ ,
  112.  att_type      =17  /* "type"       */ ,
  113.  att_usemap    =18  /* "usemap"     */ ,
  114.  att_value     =19  /* "value"      */ ,
  115.  att_valuetype =20  /* "valuetype"  */ ,
  116.  att_natts     =21
  117. }
  118. HTMLAttributes;
  119. /* Attribute strings. */
  120. static char *attributes[]=
  121. {
  122.  /* att_archive   = 0 */  "archive"     ,
  123.  /* att_background= 1 */  "background"  ,
  124.  /* att_cite      = 2 */  "cite"        ,
  125.  /* att_classid   = 3 */  "classid"     ,
  126.  /* att_code      = 4 */  "code"        ,
  127.  /* att_codebase  = 5 */  "codebase"    ,
  128.  /* att_codetype  = 6 */  "codetype"    ,
  129.  /* att_content   = 7 */  "content"     ,
  130.  /* att_data      = 8 */  "data"        ,
  131.  /* att_href      = 9 */  "href"        ,
  132.  /* att_http_equiv=10 */  "http-equiv"  ,
  133.  /* att_longdesc  =11 */  "longdesc"    ,
  134.  /* att_name      =12 */  "name"        ,
  135.  /* att_object    =13 */  "object"      ,
  136.  /* att_profile   =14 */  "profile"     ,
  137.  /* att_rel       =15 */  "rel"         ,
  138.  /* att_src       =16 */  "src"         ,
  139.  /* att_type      =17 */  "type"        ,
  140.  /* att_usemap    =18 */  "usemap"      ,
  141.  /* att_value     =19 */  "value"       ,
  142.  /* att_valuetype =20 */  "valuetype"   ,
  143. };
  144. static void parse_html(void);
  145. static char *html_yylval=NULL;
  146. extern int html_yylex(void);
  147. /*+ The refresh content of a Meta tag. +*/
  148. static char *meta_refresh=NULL;
  149. /*+ The content of a Base tag. +*/
  150. static char *base_url=NULL;
  151. /*+ The file descriptor that we are reading from. +*/
  152. static int html_yyfd=-1;
  153. /*+ The base URL of this page. +*/
  154. static URL *baseUrl=NULL;
  155. /*+ The quote character used. +*/
  156. static char *quote="";
  157. /*++++++++++++++++++++++++++++++++++++++
  158.   Parse the HTML and look for references to image/links/frames.
  159.   int fd The file descriptor of the file to parse.
  160.   URL *Url The reference URL to use.
  161.   ++++++++++++++++++++++++++++++++++++++*/
  162. void ParseHTML(int fd,URL *Url)
  163. {
  164.  static int first=1;
  165.  PrintMessage(Debug,"Parsing document using HTML parser.");
  166.  baseUrl=Url;
  167.  base_url=NULL;
  168.  SetBaseURL(baseUrl);
  169.  if(meta_refresh)
  170.     free(meta_refresh);
  171.  meta_refresh=NULL;
  172.  html_yyfd=fd;
  173.  if(!first)
  174.     html_yyrestart(NULL);
  175.  parse_html();
  176.  if(base_url)
  177.    {
  178.     baseUrl=SplitURL(base_url);
  179.     SetBaseURL(baseUrl);
  180.    }
  181.  first=0;
  182. }
  183. /*++++++++++++++++++++++++++++++++++++++
  184.   Return the URL from the Meta Refresh tag if there is one.
  185.   char *MetaRefresh Returns the new URL or NULL if none.
  186.   ++++++++++++++++++++++++++++++++++++++*/
  187. char *MetaRefresh(void)
  188. {
  189.  if(meta_refresh)
  190.    {
  191.     char *new=LinkURL(baseUrl,meta_refresh);
  192.     if(new!=meta_refresh)
  193.       {
  194.        free(meta_refresh);
  195.        meta_refresh=new;
  196.       }
  197.    }
  198.  return(meta_refresh);
  199. }
  200. /*++++++++++++++++++++++++++++++++++++++
  201.   Object and Param treatment:
  202.   this is the attempt to extract all 
  203.   valid URIs from the OBJECT or PARAM tag
  204.   defined in HTML 4.0.
  205.   there is a problem with inline data, classes, ..
  206.   or
  207.   URIs of the form
  208.   java:...
  209.   data:...
  210.   clsid:...
  211.   and inline data
  212.  
  213.   these will be taken care of later.
  214.   (c) Walter Pfannenmueller
  215.   ++++++++++++++++++++++++++++++++++++++*/
  216. /* no more than obj_archives_max are accepted: seems enough */
  217. #define obj_archives_max 32
  218. /* Object */
  219. enum {
  220.     obj_classid = 0,
  221.     obj_codetype,
  222.     obj_codebase,
  223.     obj_code,
  224.     obj_object,
  225.     obj_data,
  226.     obj_usemap,
  227.     obj_type,
  228.     obj_longdesc,
  229.     obj_archives_start,
  230.     obj_archives_end = obj_archives_start + obj_archives_max,
  231.     obj_parts_size
  232. };
  233. static int obj_codetype_Type = RefObject; 
  234. static int obj_type_Type = RefObject; 
  235. static int obj_narchives = 0;
  236. static char *obj_parts[obj_parts_size] = { NULL, }; 
  237. /* Param */
  238. enum {
  239.    param_type = 0,
  240.    param_value,
  241.    param_parts_size
  242. };
  243. static int param_valuetype_is_ref = 0;
  244. static char *param_parts[param_parts_size] = { NULL, }; 
  245. /*++++++++++++++++++++++++++++++++++++++
  246.   Free one of the obj parts.
  247.   ++++++++++++++++++++++++++++++++++++++*/
  248. static void op_free(char **op)
  249. {
  250.     if(*op != NULL)
  251.     {
  252.        free(*op);
  253.        *op = NULL;
  254.     }
  255. }
  256. /*++++++++++++++++++++++++++++++++++++++
  257.   Allocate one of the obj parts.
  258.   ++++++++++++++++++++++++++++++++++++++*/
  259. static void op_malloc(char **op,char *text)
  260. {
  261.     op_free(op);
  262.     *op = (char *)malloc(strlen(text) + 1);
  263.     strcpy(*op,text); 
  264. /*++++++++++++++++++++++++++++++++++++++
  265.   take the object's info and add codebase
  266.   ++++++++++++++++++++++++++++++++++++++*/
  267. static void codebase_url(int part, RefType refType)
  268. {
  269.     if(obj_parts[part])
  270.     {
  271.         if(obj_parts[obj_codebase])
  272.         {
  273.             char *url = (char *)malloc(strlen(obj_parts[obj_codebase]) + 
  274.                                        sizeof("/") +
  275.                                        strlen(obj_parts[part])+1);
  276.             strcpy(url,obj_parts[obj_codebase]); 
  277.             if(url[strlen(url) - 1] != '/')
  278.             {
  279.                 strcat(url,"/"); 
  280.             }
  281.             strcat(url,obj_parts[part]);
  282.             op_free(&obj_parts[part]);
  283.             obj_parts[part] = url;
  284.         }
  285.         AddReference(obj_parts[part], refType);
  286.     }
  287. }
  288. /*++++++++++++++++++++++++++++++++++++++
  289.   take the object's info and build an applet url
  290.   ++++++++++++++++++++++++++++++++++++++*/
  291. static void java_applet_url(int part, RefType refType)
  292. {
  293.     if(obj_parts[part])
  294.     {
  295.         char *dots;
  296.         char *applet = obj_parts[part];
  297.         static const char class_suffix[] = ".class";
  298.         if(strcmp(&applet[strlen(applet) - strlen(class_suffix)],class_suffix))
  299.         {
  300.             applet = (char *)malloc(strlen(applet) + sizeof(class_suffix));
  301.             strcpy(applet,obj_parts[part]); 
  302.             strcat(applet,class_suffix); 
  303.             op_free(&obj_parts[part]);
  304.             obj_parts[part] = applet;
  305.         }
  306.         dots = applet;
  307.         while((dots = strchr(dots,'.')) < (applet + strlen(applet) - sizeof(class_suffix)))
  308.         {
  309.            *dots = '/';
  310.         }
  311.         codebase_url(part,refType);
  312.     }
  313. }
  314. /*++++++++++++++++++++++++++++++++++++++
  315.   take the object's info and build urls
  316.   ++++++++++++++++++++++++++++++++++++++*/
  317. static void build_obj_urls()
  318. {
  319.     int i;
  320.     /* asuming, referenced objects from within objects
  321.        are already in the archives */ 
  322.     RefType refType = obj_narchives > 0 ? RefObject : RefInlineObject;
  323.     if(obj_codetype_Type == RefImage || obj_type_Type == RefImage)
  324.     {
  325.         codebase_url(obj_classid,RefImage);
  326.         codebase_url(obj_data,RefImage);
  327.     }
  328.     else
  329.     {
  330.         /* only change classid,data to .class - file if we are sure
  331.            we do have a java object */ 
  332.         if(obj_parts[obj_codetype] &&
  333.            !strcmp(obj_parts[obj_codetype],"application/java")
  334.         )
  335.         {
  336.            java_applet_url(obj_classid,refType);
  337.            java_applet_url(obj_data,refType);
  338.         }
  339.         else
  340.         {
  341.            codebase_url(obj_classid,refType);
  342.            codebase_url(obj_data,refType);
  343.         }    
  344.     }
  345.     java_applet_url(obj_code,refType);
  346.     java_applet_url(obj_object,refType);
  347.     codebase_url(obj_usemap,RefLink);
  348.     codebase_url(obj_longdesc,RefLink);
  349.     for(i = 0; i < obj_narchives; i++)
  350.     {
  351.         codebase_url(i + obj_archives_start, RefObject);
  352.     }
  353.     for(i = 0; i < obj_parts_size; i++)
  354.     {
  355.        op_free(&obj_parts[i]);
  356.     }
  357.     obj_codetype_Type = RefObject; 
  358.     obj_type_Type = RefObject; 
  359.     obj_narchives = 0;
  360. }
  361. /*+++++++++++++++++++++++++++++++++++++++++
  362.   take the param's info and build urls
  363.   +++++++++++++++++++++++++++++++++++++++++*/
  364. static void build_param_urls()
  365. {
  366.     int i;
  367.     if(param_valuetype_is_ref && param_parts[param_value])
  368.     {
  369.         AddReference(param_parts[param_value],RefObject);
  370.     } 
  371.     param_valuetype_is_ref = 0;
  372.     for(i = 0; i < param_parts_size; i++)
  373.     {
  374.        op_free(&param_parts[i]);
  375.     }
  376. }
  377. /*++++++++++++++++++++++++++++++++++++++
  378.   Parse the HTML and look for references to image/links/frames.
  379.   ++++++++++++++++++++++++++++++++++++++*/
  380. static void parse_html(void)
  381. {
  382.  HTMLTags tag=tag_ntags;
  383.  HTMLAttributes key=att_natts;
  384.  RefType ref;
  385.  int link_rel_style=0,meta_http_equiv_refresh=0;
  386.  int yychar;
  387.  /* The actual parser. */
  388.  while((yychar=html_yylex()))
  389.     switch(yychar)
  390.       {
  391.       case LEX_PLAINTEXT:
  392.        break;
  393.       case LEX_COMMENT:
  394.        break;
  395.       case LEX_DOCTYPE:
  396.        break;
  397.       case LEX_TAG_BEGIN:
  398.        for(tag=0;tag<tag_ntags;tag++)
  399.           if(!strcasecmp(html_yylval,tags[tag]))
  400.              break;
  401.        break;
  402.       case LEX_TAG_END:
  403.        if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
  404.           build_obj_urls();
  405.        if(tag==tag_param)
  406.           build_param_urls();
  407.        tag=tag_ntags;
  408.        key=att_natts;
  409.        link_rel_style=0,meta_http_equiv_refresh=0;
  410.        break;
  411.       case LEX_ATTR_KEY:
  412.        if(tag==tag_ntags)
  413.           break;
  414.        for(key=0;key<att_natts;key++)
  415.           if(!strcasecmp(html_yylval,attributes[key]))
  416.              break;
  417.       break;
  418.       case LEX_ATTR_VAL:
  419.        if(key==att_natts)
  420.           break;
  421.        /* Simple links and stuff */
  422.        ref=NRefTypes;
  423.        if(key==att_href && (tag==tag_a || tag==tag_area))
  424.           ref=RefLink;
  425.        else if(key==att_src && (tag==tag_input || tag==tag_img))
  426.           ref=RefImage;
  427.        else if(key==att_src && tag==tag_script)
  428.           ref=RefScript;
  429.        else if(key==att_src && (tag==tag_frame || tag==tag_iframe))
  430.           ref=RefFrame;
  431.        else if(key==att_cite && (tag==tag_q || tag==tag_blockquote || tag==tag_ins || tag==tag_del))
  432.           ref=RefLink;
  433.        else if(key==att_background && tag==tag_body)
  434.           ref=RefImage;
  435.        else if(key==att_longdesc && (tag==tag_frame || tag==tag_iframe || tag==tag_img))
  436.           ref=RefLink;
  437.        else if(key==att_usemap && (tag==tag_input || tag==tag_img))
  438.           ref=RefImage;
  439.        else if(key==att_profile && tag==tag_head)
  440.           ref=RefLink;
  441.        if(ref!=NRefTypes)
  442.          {AddReference(html_yylval,ref);break;}
  443.        /* Other simple non-reference ones. */
  444.        if(key==att_href && tag==tag_base)
  445.          {base_url=(char*)malloc(strlen(html_yylval)+1); strcpy(base_url,html_yylval); break;}
  446.        /* Some more complicated ones that depend on other attributes. */
  447.        if(tag==tag_link)
  448.           if(key==att_rel && !strncasecmp(html_yylval,"Stylesheet",10))
  449.             {link_rel_style=1;break;}
  450.           else if(key==att_href)
  451.              if(link_rel_style)
  452.                {AddReference(html_yylval,RefStyleSheet);break;}
  453.              else
  454.                {AddReference(html_yylval,RefLink);break;}
  455.        if(tag==tag_meta)
  456.          {
  457.           if(key==att_http_equiv && !strncasecmp(html_yylval,"Refresh",7))
  458.             {meta_http_equiv_refresh=1;break;}
  459.           else if(key==att_content && meta_http_equiv_refresh)
  460.             {
  461.              char *p;
  462.              /* ' *[0-9].?[0-9]* *[;,] *(URL *= *|)http://...' */
  463.              p=html_yylval;
  464.              while(isspace(*p)) p++;
  465.              if(!isdigit(*p))
  466.                 break;
  467.              while(isdigit(*p)) p++;
  468.              if(*p=='.')
  469.                {p++; while(isdigit(*p)) p++;}
  470.              while(isspace(*p)) p++;
  471.              if(*p!=';' && *p!=',')
  472.                 break;
  473.              p++;
  474.              while(isspace(*p)) p++;
  475.              if(!strncasecmp(p,"URL",3))
  476.                {
  477.                 p+=3;
  478.                 while(isspace(*p)) p++;
  479.                 if(*p!='=') break;
  480.                 p++;
  481.                 while(isspace(*p)) p++;
  482.                }
  483.              if(!*p)
  484.                 break;
  485.              meta_refresh=(char*)malloc(strlen(p)+1); strcpy(meta_refresh,p);
  486.              break;
  487.             }
  488.          }
  489.        /* Complex object type ones. */
  490.        if(tag==tag_param)
  491.          {
  492.           if(key==att_valuetype && !strcasecmp(html_yylval,"ref"))
  493.              param_valuetype_is_ref = 1;
  494.           else if(key==att_name && (!strcasecmp(html_yylval,"href") || !strcasecmp(html_yylval,"file") || !strcasecmp(html_yylval,"ref")))
  495.              param_valuetype_is_ref = 1;
  496.           else if(key==att_type)  op_malloc(&param_parts[param_type] ,html_yylval);
  497.           else if(key==att_value) op_malloc(&param_parts[param_value],html_yylval);
  498.          }
  499.        else if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
  500.          {
  501.           if(key==att_src)
  502.             {AddReference(html_yylval,RefInlineObject);break;}
  503.           else if(key==att_archive)
  504.             {
  505.              char *p,*q=html_yylval;
  506.              while((p=strtok(q," trn,")))
  507.                {
  508.                 if(obj_narchives < obj_archives_max)
  509.                    op_malloc(&obj_parts[obj_archives_start + obj_narchives++],p);
  510.                 q=NULL;
  511.                }
  512.              break;
  513.             }
  514.           else if(key==att_code)     op_malloc(&obj_parts[obj_code]    ,html_yylval);
  515.           else if(key==att_object)   op_malloc(&obj_parts[obj_object]  ,html_yylval);
  516.           else if(key==att_codebase) op_malloc(&obj_parts[obj_codebase],html_yylval);
  517.           else if(key==att_data)     op_malloc(&obj_parts[obj_data]    ,html_yylval);
  518.           else if(key==att_usemap)   op_malloc(&obj_parts[obj_usemap]  ,html_yylval);
  519.           else if(key==att_longdesc) op_malloc(&obj_parts[obj_longdesc],html_yylval);
  520.           else if(key==att_classid)
  521.             {
  522.              if(!strncasecmp(html_yylval,"java:",5))       op_malloc(&obj_parts[obj_classid],html_yylval+5);
  523.              else if(!strncasecmp(html_yylval,"clsid:",6)) op_malloc(&obj_parts[obj_classid],html_yylval+6);
  524.              if(strncasecmp(html_yylval,"data:",5))        op_malloc(&obj_parts[obj_classid],html_yylval);
  525.             }
  526.           else if(key==att_codetype)
  527.             {
  528.              if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_codetype],html_yylval+5); obj_codetype_Type = RefImage;}
  529.              else                                     op_malloc(&obj_parts[obj_codetype],html_yylval);
  530.             }
  531.           else if(key==att_type)
  532.             {
  533.              if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_type],html_yylval+5); obj_type_Type = RefImage;}
  534.              else                                     op_malloc(&obj_parts[obj_type],html_yylval);
  535.             }
  536.          }
  537.        key=att_natts;
  538.        break;
  539.       default:
  540.       }
  541. }
  542. #ifndef html_yywrap
  543. /*+ Needed in lex but does nothing. +*/
  544. #define html_yywrap() 1
  545. #endif
  546. /*+ Reset the current string. +*/
  547. #define reset_string 
  548.  *string=0; 
  549.  stringused=0;
  550. /*+ append information to the current string. +*/
  551. #define append_string(xx) 
  552.  newlen=strlen(xx); 
  553.  if((stringused+newlen)>=stringlen) 
  554.     string=(char*)realloc((void*)string,stringlen=(stringused+newlen+1)); 
  555.  strcpy(string+stringused,xx); 
  556.  stringused+=newlen;
  557. /*+ A macro to read data that can be used by the lexer. +*/
  558. #define YY_INPUT(buf,result,max_size) 
  559.         if((result=read_data(html_yyfd,buf,max_size))==-1) 
  560.            result=0;
  561. %}
  562. %%
  563.  char *string=malloc(128);
  564.  int stringlen=128,stringused=0,newlen;
  565.  /* Handle comments and other tags */
  566. [^<]+                       { /* html_yylval=html_yytext; return(LEX_PLAINTEXT); */ }
  567. "<!DOCTYPE"                 { BEGIN(DOCTYPE); reset_string; }
  568. "<!--"                      { BEGIN(COMMENT); reset_string; }
  569. "<!"{W}*"-"*                { BEGIN(COMMENT_BAD); reset_string; }
  570. "<"{W}*                     { BEGIN(TAG_START); reset_string; /* append_string(html_yytext); */ }
  571.  /* Doctype (DTD) */
  572. <DOCTYPE>">"                { BEGIN(INITIAL); /* html_yylval=string; return(LEX_DOCTYPE); */ }
  573. <DOCTYPE>[^>]+              { /* append_string(html_yytext); */ }
  574.  /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
  575.                COMMENT is not strictly correct, but works better than the real thing. */
  576. <COMMENT>"--"{W}*">"        { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
  577. <COMMENT>">"                { /* append_string(html_yytext); */ }
  578. <COMMENT>"-"                { /* append_string(html_yytext); */ }
  579. <COMMENT>[^->]+             { /* append_string(html_yytext); */ }
  580. <COMMENT_BAD>">"            { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
  581. <COMMENT_BAD>[^>]+          { /* append_string(html_yytext); */ }
  582.  /* Tags */
  583. <TAG_START>"/"?{K}+/{W}     { BEGIN(TAG); html_yylval=html_yytext; return(LEX_TAG_BEGIN); }
  584. <TAG_START>"/"?{K}+/">"     { BEGIN(TAG); html_yylval=html_yytext; return(LEX_TAG_BEGIN); }
  585. <TAG_START>(.|n)           { BEGIN(INITIAL); }
  586. <TAG>">"                    { BEGIN(INITIAL); html_yylval=""; return(LEX_TAG_END); }
  587. <TAG>"<"                    { BEGIN(INITIAL); unput(html_yytext[0]); html_yylval=""; return(LEX_TAG_END); }
  588. <TAG>{K}+                   { BEGIN(TAG_ATTR_KEY); html_yylval=html_yytext; return(LEX_ATTR_KEY); }
  589. <TAG>(.|n)                 { }
  590. <TAG_ATTR_KEY>{W}*=         { BEGIN(TAG_ATTR_VAL); }
  591. <TAG_ATTR_KEY>(.|n)        { BEGIN(TAG); unput(html_yytext[0]); html_yylval=NULL; return(LEX_ATTR_VAL); }
  592. <TAG_ATTR_VAL>"            { BEGIN(DQUOTED); reset_string; }
  593. <TAG_ATTR_VAL>'            { BEGIN(SQUOTED); reset_string; }
  594. <TAG_ATTR_VAL>{W}+          { }
  595. <TAG_ATTR_VAL>{F}+          { BEGIN(TAG); html_yylval=html_yytext; quote=""; return(LEX_ATTR_VAL); }
  596. <TAG_ATTR_VAL>(.|n)        { BEGIN(TAG); unput(html_yytext[0]); html_yylval=""; quote=""; return(LEX_ATTR_VAL); }
  597.  /* Quoted strings */
  598. <DQUOTED>\"               { append_string(html_yytext); }
  599. <DQUOTED>\                 { append_string(html_yytext); }
  600. <DQUOTED>"                 { BEGIN(TAG); html_yylval=string; quote="""; return(LEX_ATTR_VAL); }
  601. <DQUOTED>(r|n)+           { }
  602. <DQUOTED>[^\"rn]+       { append_string(html_yytext); }
  603. <SQUOTED>\'               { append_string(html_yytext); }
  604. <SQUOTED>\                 { append_string(html_yytext); }
  605. <SQUOTED>'                 { BEGIN(TAG); html_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
  606. <SQUOTED>(r|n)+           { }
  607. <SQUOTED>[^\'rn]+       { append_string(html_yytext); }
  608.  /* End of file */
  609. <<EOF>>                     { free(string); BEGIN(INITIAL); return(0); }
  610. %%