html.l
上传用户:seven77cht
上传日期:2007-01-04
资源大小:486k
文件大小:21k
- W [ trn]
- F [-a-z0-9$_.!*(),%;/?:@&=+~|#]
- K [a-z0-9-]
- %x DOCTYPE
- %x COMMENT COMMENT_BAD
- %x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
- %x DQUOTED SQUOTED
- %{
- /***************************************
- $Header: /home/amb/wwwoffle/RCS/html.l 2.48 2000/03/14 19:12:21 amb Exp $
- WWWOFFLE - World Wide Web Offline Explorer - Version 2.5e.
- Parse the HTML and look for the images, links and other things.
- ******************/ /******************
- Written by Andrew M. Bishop
- Object handling by Walter Pfannenmueller
- This file Copyright 1997,98,99,2000 Andrew M. Bishop
- It may be distributed under the GNU Public License, version 2, or
- any higher version. See section COPYING of the GNU Public license
- for conditions under which this file may be redistributed.
- ***************************************/
- #include <stdlib.h>
- #include <string.h>
- #include <ctype.h>
- #include <sys/stat.h>
- #include <unistd.h>
- #include <time.h>
- #include "wwwoffle.h"
- #include "document.h"
- #include "config.h"
- #include "misc.h"
- #include "errors.h"
/* Parser outputs */

/*+ The token codes returned by html_yylex() and dispatched on in parse_html().
    Zero is not used as a code because the lexer returns 0 at end of file. +*/
enum
{
 LEX_PLAINTEXT = 1,             /* Text outside of any markup.           */
 LEX_COMMENT   = 2,             /* A complete comment.                   */
 LEX_DOCTYPE   = 3,             /* A complete DOCTYPE declaration.       */
 LEX_TAG_BEGIN = 4,             /* The name of a tag that has started.   */
 LEX_TAG_END   = 5,             /* The end of the current tag.           */
 LEX_ATTR_KEY  = 6,             /* The name of an attribute.             */
 LEX_ATTR_VAL  = 7              /* The value of the preceding attribute. */
};
/* Tag types */

/*+ The tags that the parser recognises.  The values are indexes into tags[]
    so the two tables must be kept in the same order.  tag_ntags is the number
    of entries and is also used by parse_html() to mean "no recognised tag". +*/
typedef enum _HTMLTags
{
 tag_a         = 0,             /* "a"          */
 tag_applet    = 1,             /* "applet"     */
 tag_area      = 2,             /* "area"       */
 tag_base      = 3,             /* "base"       */
 tag_blockquote= 4,             /* "blockquote" */
 tag_body      = 5,             /* "body"       */
 tag_del       = 6,             /* "del"        */
 tag_embed     = 7,             /* "embed"      */
 tag_frame     = 8,             /* "frame"      */
 tag_head      = 9,             /* "head"       */
 tag_iframe    =10,             /* "iframe"     */
 tag_img       =11,             /* "img"        */
 tag_input     =12,             /* "input"      */
 tag_ins       =13,             /* "ins"        */
 tag_link      =14,             /* "link"       */
 tag_meta      =15,             /* "meta"       */
 tag_object    =16,             /* "object"     */
 tag_param     =17,             /* "param"      */
 tag_q         =18,             /* "q"          */
 tag_script    =19,             /* "script"     */
 tag_xml       =20,             /* "xml"        */
 tag_ntags     =21
}
HTMLTags;

/* Tag strings */

/*+ The tag names, indexed by HTMLTags and compared without regard to case.
    The iframe entry used to read "iframes", which can never equal the name
    of a real <iframe> element, so inline frames were silently ignored. +*/
static const char *const tags[]=
{
 /* tag_a         = 0 */ "a"          ,
 /* tag_applet    = 1 */ "applet"     ,
 /* tag_area      = 2 */ "area"       ,
 /* tag_base      = 3 */ "base"       ,
 /* tag_blockquote= 4 */ "blockquote" ,
 /* tag_body      = 5 */ "body"       ,
 /* tag_del       = 6 */ "del"        ,
 /* tag_embed     = 7 */ "embed"      ,
 /* tag_frame     = 8 */ "frame"      ,
 /* tag_head      = 9 */ "head"       ,
 /* tag_iframe    =10 */ "iframe"     ,
 /* tag_img       =11 */ "img"        ,
 /* tag_input     =12 */ "input"      ,
 /* tag_ins       =13 */ "ins"        ,
 /* tag_link      =14 */ "link"       ,
 /* tag_meta      =15 */ "meta"       ,
 /* tag_object    =16 */ "object"     ,
 /* tag_param     =17 */ "param"      ,
 /* tag_q         =18 */ "q"          ,
 /* tag_script    =19 */ "script"     ,
 /* tag_xml       =20 */ "xml"
};
/* Attribute types */

/*+ The attributes that the parser recognises.  The values are indexes into
    attributes[] so the two tables must be kept in the same order.  att_natts
    is the number of entries and is also used by parse_html() to mean
    "no recognised attribute". +*/
typedef enum _HTMLAttributes
{
 att_archive   = 0,             /* "archive"    */
 att_background= 1,             /* "background" */
 att_cite      = 2,             /* "cite"       */
 att_classid   = 3,             /* "classid"    */
 att_code      = 4,             /* "code"       */
 att_codebase  = 5,             /* "codebase"   */
 att_codetype  = 6,             /* "codetype"   */
 att_content   = 7,             /* "content"    */
 att_data      = 8,             /* "data"       */
 att_href      = 9,             /* "href"       */
 att_http_equiv=10,             /* "http-equiv" */
 att_longdesc  =11,             /* "longdesc"   */
 att_name      =12,             /* "name"       */
 att_object    =13,             /* "object"     */
 att_profile   =14,             /* "profile"    */
 att_rel       =15,             /* "rel"        */
 att_src       =16,             /* "src"        */
 att_type      =17,             /* "type"       */
 att_usemap    =18,             /* "usemap"     */
 att_value     =19,             /* "value"      */
 att_valuetype =20,             /* "valuetype"  */
 att_natts     =21
}
HTMLAttributes;

/* Attribute strings. */

/*+ The attribute names, indexed by HTMLAttributes and compared without regard
    to case.  The table only holds string literals so it is read-only. +*/
static const char *const attributes[]=
{
 /* att_archive   = 0 */ "archive"    ,
 /* att_background= 1 */ "background" ,
 /* att_cite      = 2 */ "cite"       ,
 /* att_classid   = 3 */ "classid"    ,
 /* att_code      = 4 */ "code"       ,
 /* att_codebase  = 5 */ "codebase"   ,
 /* att_codetype  = 6 */ "codetype"   ,
 /* att_content   = 7 */ "content"    ,
 /* att_data      = 8 */ "data"       ,
 /* att_href      = 9 */ "href"       ,
 /* att_http_equiv=10 */ "http-equiv" ,
 /* att_longdesc  =11 */ "longdesc"   ,
 /* att_name      =12 */ "name"       ,
 /* att_object    =13 */ "object"     ,
 /* att_profile   =14 */ "profile"    ,
 /* att_rel       =15 */ "rel"        ,
 /* att_src       =16 */ "src"        ,
 /* att_type      =17 */ "type"       ,
 /* att_usemap    =18 */ "usemap"     ,
 /* att_value     =19 */ "value"      ,
 /* att_valuetype =20 */ "valuetype"
};
- static void parse_html(void);
- static char *html_yylval=NULL;
- extern int html_yylex(void);
- /*+ The refresh content of a Meta tag. +*/
- static char *meta_refresh=NULL;
- /*+ The content of a Base tag. +*/
- static char *base_url=NULL;
- /*+ The file descriptor that we are reading from. +*/
- static int html_yyfd=-1;
- /*+ The base URL of this page. +*/
- static URL *baseUrl=NULL;
- /*+ The quote character used. +*/
- static char *quote="";
- /*++++++++++++++++++++++++++++++++++++++
- Parse the HTML and look for references to image/links/frames.
- int fd The file descriptor of the file to parse.
- URL *Url The reference URL to use.
- ++++++++++++++++++++++++++++++++++++++*/
- void ParseHTML(int fd,URL *Url)
- {
- static int first=1;
- PrintMessage(Debug,"Parsing document using HTML parser.");
- baseUrl=Url;
- base_url=NULL;
- SetBaseURL(baseUrl);
- if(meta_refresh)
- free(meta_refresh);
- meta_refresh=NULL;
- html_yyfd=fd;
- if(!first)
- html_yyrestart(NULL);
- parse_html();
- if(base_url)
- {
- baseUrl=SplitURL(base_url);
- SetBaseURL(baseUrl);
- }
- first=0;
- }
- /*++++++++++++++++++++++++++++++++++++++
- Return the URL from the Meta Refresh tag if there is one.
- char *MetaRefresh Returns the new URL or NULL if none.
- ++++++++++++++++++++++++++++++++++++++*/
- char *MetaRefresh(void)
- {
- if(meta_refresh)
- {
- char *new=LinkURL(baseUrl,meta_refresh);
- if(new!=meta_refresh)
- {
- free(meta_refresh);
- meta_refresh=new;
- }
- }
- return(meta_refresh);
- }
- /*++++++++++++++++++++++++++++++++++++++
- Object and Param treatment:
- this is the attempt to extract all
- valid URIs from the OBJECT or PARAM tag
- defined in HTML 4.0.
- there is a problem with inline data, classes, ..
- or
- URIs of the form
- java:...
- data:...
- clsid:...
- and inline data
-
- these will be taken care of later.
- (c) Walter Pfannenmueller
- ++++++++++++++++++++++++++++++++++++++*/
- /* no more than obj_archives_max are accepted: seems enough */
- #define obj_archives_max 32
- /* Object */
- enum {
- obj_classid = 0,
- obj_codetype,
- obj_codebase,
- obj_code,
- obj_object,
- obj_data,
- obj_usemap,
- obj_type,
- obj_longdesc,
- obj_archives_start,
- obj_archives_end = obj_archives_start + obj_archives_max,
- obj_parts_size
- };
- static int obj_codetype_Type = RefObject;
- static int obj_type_Type = RefObject;
- static int obj_narchives = 0;
- static char *obj_parts[obj_parts_size] = { NULL, };
- /* Param */
- enum {
- param_type = 0,
- param_value,
- param_parts_size
- };
- static int param_valuetype_is_ref = 0;
- static char *param_parts[param_parts_size] = { NULL, };
/*++++++++++++++++++++++++++++++++++++++
  Free one of the obj parts and mark the slot as empty.

  char **op The slot to release; it is left pointing to NULL.  A slot that is
            already empty is left alone.
  ++++++++++++++++++++++++++++++++++++++*/

static void op_free(char **op)
{
 /* free() accepts NULL so an empty slot needs no special case. */

 free(*op);
 *op=NULL;
}
/*++++++++++++++++++++++++++++++++++++++
  Allocate one of the obj parts, replacing whatever the slot held before.

  char **op The slot to fill.  Its old contents are freed.  The slot is left
            NULL if there is no text or the memory cannot be allocated, which
            is the state that the users of the slots already treat as unset.

  const char *text The string to store a copy of; this may be NULL and may
                   point into the string currently held by the slot.
  ++++++++++++++++++++++++++++++++++++++*/

static void op_malloc(char **op,const char *text)
{
 char *copy=NULL;

 if(text)
   {
    size_t size=strlen(text)+1;

    copy=(char *)malloc(size);

    if(copy)
       memcpy(copy,text,size);
   }

 /* Release the old value only after copying in case text points into it. */

 free(*op);
 *op=copy;
}
- /*++++++++++++++++++++++++++++++++++++++
- take the object's info and add codebase
- ++++++++++++++++++++++++++++++++++++++*/
- static void codebase_url(int part, RefType refType)
- {
- if(obj_parts[part])
- {
- if(obj_parts[obj_codebase])
- {
- char *url = (char *)malloc(strlen(obj_parts[obj_codebase]) +
- sizeof("/") +
- strlen(obj_parts[part])+1);
- strcpy(url,obj_parts[obj_codebase]);
- if(url[strlen(url) - 1] != '/')
- {
- strcat(url,"/");
- }
- strcat(url,obj_parts[part]);
- op_free(&obj_parts[part]);
- obj_parts[part] = url;
- }
- AddReference(obj_parts[part], refType);
- }
- }
- /*++++++++++++++++++++++++++++++++++++++
- take the object's info and build an applet url
- ++++++++++++++++++++++++++++++++++++++*/
- static void java_applet_url(int part, RefType refType)
- {
- if(obj_parts[part])
- {
- char *dots;
- char *applet = obj_parts[part];
- static const char class_suffix[] = ".class";
- if(strcmp(&applet[strlen(applet) - strlen(class_suffix)],class_suffix))
- {
- applet = (char *)malloc(strlen(applet) + sizeof(class_suffix));
- strcpy(applet,obj_parts[part]);
- strcat(applet,class_suffix);
- op_free(&obj_parts[part]);
- obj_parts[part] = applet;
- }
- dots = applet;
- while((dots = strchr(dots,'.')) < (applet + strlen(applet) - sizeof(class_suffix)))
- {
- *dots = '/';
- }
- codebase_url(part,refType);
- }
- }
- /*++++++++++++++++++++++++++++++++++++++
- take the object's info and build urls
- ++++++++++++++++++++++++++++++++++++++*/
- static void build_obj_urls()
- {
- int i;
- /* asuming, referenced objects from within objects
- are already in the archives */
- RefType refType = obj_narchives > 0 ? RefObject : RefInlineObject;
- if(obj_codetype_Type == RefImage || obj_type_Type == RefImage)
- {
- codebase_url(obj_classid,RefImage);
- codebase_url(obj_data,RefImage);
- }
- else
- {
- /* only change classid,data to .class - file if we are sure
- we do have a java object */
- if(obj_parts[obj_codetype] &&
- !strcmp(obj_parts[obj_codetype],"application/java")
- )
- {
- java_applet_url(obj_classid,refType);
- java_applet_url(obj_data,refType);
- }
- else
- {
- codebase_url(obj_classid,refType);
- codebase_url(obj_data,refType);
- }
- }
- java_applet_url(obj_code,refType);
- java_applet_url(obj_object,refType);
- codebase_url(obj_usemap,RefLink);
- codebase_url(obj_longdesc,RefLink);
- for(i = 0; i < obj_narchives; i++)
- {
- codebase_url(i + obj_archives_start, RefObject);
- }
- for(i = 0; i < obj_parts_size; i++)
- {
- op_free(&obj_parts[i]);
- }
- obj_codetype_Type = RefObject;
- obj_type_Type = RefObject;
- obj_narchives = 0;
- }
- /*+++++++++++++++++++++++++++++++++++++++++
- take the param's info and build urls
- +++++++++++++++++++++++++++++++++++++++++*/
- static void build_param_urls()
- {
- int i;
- if(param_valuetype_is_ref && param_parts[param_value])
- {
- AddReference(param_parts[param_value],RefObject);
- }
- param_valuetype_is_ref = 0;
- for(i = 0; i < param_parts_size; i++)
- {
- op_free(¶m_parts[i]);
- }
- }
- /*++++++++++++++++++++++++++++++++++++++
- Parse the HTML and look for references to image/links/frames.
- ++++++++++++++++++++++++++++++++++++++*/
- static void parse_html(void)
- {
- HTMLTags tag=tag_ntags;
- HTMLAttributes key=att_natts;
- RefType ref;
- int link_rel_style=0,meta_http_equiv_refresh=0;
- int yychar;
- /* The actual parser. */
- while((yychar=html_yylex()))
- switch(yychar)
- {
- case LEX_PLAINTEXT:
- break;
- case LEX_COMMENT:
- break;
- case LEX_DOCTYPE:
- break;
- case LEX_TAG_BEGIN:
- for(tag=0;tag<tag_ntags;tag++)
- if(!strcasecmp(html_yylval,tags[tag]))
- break;
- break;
- case LEX_TAG_END:
- if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
- build_obj_urls();
- if(tag==tag_param)
- build_param_urls();
- tag=tag_ntags;
- key=att_natts;
- link_rel_style=0,meta_http_equiv_refresh=0;
- break;
- case LEX_ATTR_KEY:
- if(tag==tag_ntags)
- break;
- for(key=0;key<att_natts;key++)
- if(!strcasecmp(html_yylval,attributes[key]))
- break;
- break;
- case LEX_ATTR_VAL:
- if(key==att_natts)
- break;
- /* Simple links and stuff */
- ref=NRefTypes;
- if(key==att_href && (tag==tag_a || tag==tag_area))
- ref=RefLink;
- else if(key==att_src && (tag==tag_input || tag==tag_img))
- ref=RefImage;
- else if(key==att_src && tag==tag_script)
- ref=RefScript;
- else if(key==att_src && (tag==tag_frame || tag==tag_iframe))
- ref=RefFrame;
- else if(key==att_cite && (tag==tag_q || tag==tag_blockquote || tag==tag_ins || tag==tag_del))
- ref=RefLink;
- else if(key==att_background && tag==tag_body)
- ref=RefImage;
- else if(key==att_longdesc && (tag==tag_frame || tag==tag_iframe || tag==tag_img))
- ref=RefLink;
- else if(key==att_usemap && (tag==tag_input || tag==tag_img))
- ref=RefImage;
- else if(key==att_profile && tag==tag_head)
- ref=RefLink;
- if(ref!=NRefTypes)
- {AddReference(html_yylval,ref);break;}
- /* Other simple non-reference ones. */
- if(key==att_href && tag==tag_base)
- {base_url=(char*)malloc(strlen(html_yylval)+1); strcpy(base_url,html_yylval); break;}
- /* Some more complicated ones that depend on other attributes. */
- if(tag==tag_link)
- if(key==att_rel && !strncasecmp(html_yylval,"Stylesheet",10))
- {link_rel_style=1;break;}
- else if(key==att_href)
- if(link_rel_style)
- {AddReference(html_yylval,RefStyleSheet);break;}
- else
- {AddReference(html_yylval,RefLink);break;}
- if(tag==tag_meta)
- {
- if(key==att_http_equiv && !strncasecmp(html_yylval,"Refresh",7))
- {meta_http_equiv_refresh=1;break;}
- else if(key==att_content && meta_http_equiv_refresh)
- {
- char *p;
- /* ' *[0-9].?[0-9]* *[;,] *(URL *= *|)http://...' */
- p=html_yylval;
- while(isspace(*p)) p++;
- if(!isdigit(*p))
- break;
- while(isdigit(*p)) p++;
- if(*p=='.')
- {p++; while(isdigit(*p)) p++;}
- while(isspace(*p)) p++;
- if(*p!=';' && *p!=',')
- break;
- p++;
- while(isspace(*p)) p++;
- if(!strncasecmp(p,"URL",3))
- {
- p+=3;
- while(isspace(*p)) p++;
- if(*p!='=') break;
- p++;
- while(isspace(*p)) p++;
- }
- if(!*p)
- break;
- meta_refresh=(char*)malloc(strlen(p)+1); strcpy(meta_refresh,p);
- break;
- }
- }
- /* Complex object type ones. */
- if(tag==tag_param)
- {
- if(key==att_valuetype && !strcasecmp(html_yylval,"ref"))
- param_valuetype_is_ref = 1;
- else if(key==att_name && (!strcasecmp(html_yylval,"href") || !strcasecmp(html_yylval,"file") || !strcasecmp(html_yylval,"ref")))
- param_valuetype_is_ref = 1;
- else if(key==att_type) op_malloc(¶m_parts[param_type] ,html_yylval);
- else if(key==att_value) op_malloc(¶m_parts[param_value],html_yylval);
- }
- else if(tag==tag_object || tag==tag_applet || tag==tag_embed || tag==tag_xml)
- {
- if(key==att_src)
- {AddReference(html_yylval,RefInlineObject);break;}
- else if(key==att_archive)
- {
- char *p,*q=html_yylval;
- while((p=strtok(q," trn,")))
- {
- if(obj_narchives < obj_archives_max)
- op_malloc(&obj_parts[obj_archives_start + obj_narchives++],p);
- q=NULL;
- }
- break;
- }
- else if(key==att_code) op_malloc(&obj_parts[obj_code] ,html_yylval);
- else if(key==att_object) op_malloc(&obj_parts[obj_object] ,html_yylval);
- else if(key==att_codebase) op_malloc(&obj_parts[obj_codebase],html_yylval);
- else if(key==att_data) op_malloc(&obj_parts[obj_data] ,html_yylval);
- else if(key==att_usemap) op_malloc(&obj_parts[obj_usemap] ,html_yylval);
- else if(key==att_longdesc) op_malloc(&obj_parts[obj_longdesc],html_yylval);
- else if(key==att_classid)
- {
- if(!strncasecmp(html_yylval,"java:",5)) op_malloc(&obj_parts[obj_classid],html_yylval+5);
- else if(!strncasecmp(html_yylval,"clsid:",6)) op_malloc(&obj_parts[obj_classid],html_yylval+6);
- if(strncasecmp(html_yylval,"data:",5)) op_malloc(&obj_parts[obj_classid],html_yylval);
- }
- else if(key==att_codetype)
- {
- if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_codetype],html_yylval+5); obj_codetype_Type = RefImage;}
- else op_malloc(&obj_parts[obj_codetype],html_yylval);
- }
- else if(key==att_type)
- {
- if(!strncasecmp(html_yylval,"image",5)) {op_malloc(&obj_parts[obj_type],html_yylval+5); obj_type_Type = RefImage;}
- else op_malloc(&obj_parts[obj_type],html_yylval);
- }
- }
- key=att_natts;
- break;
- default:
- }
- }
#ifndef html_yywrap
/*+ Needed in lex but does nothing. +*/
#define html_yywrap() 1
#endif

/* The macros below had lost the backslashes that continue their lines, which
   left the macros empty and their bodies as statements outside any function.
   They use the variables string, stringlen, stringused and newlen that are
   declared at the top of the rules section. */

/*+ Reset the current string. +*/
#define reset_string \
 do \
   { \
    *string=0; \
    stringused=0; \
   } \
 while(0)

/*+ append information to the current string, making the buffer bigger if
    needed.  The text is dropped if the memory cannot be allocated so that the
    buffer is never lost or overrun.  The argument is evaluated twice. +*/
#define append_string(xx) \
 do \
   { \
    newlen=strlen(xx); \
    if((stringused+newlen)>=stringlen) \
      { \
       char *bigger=(char*)realloc((void*)string,stringused+newlen+1); \
       if(!bigger) \
          break; \
       string=bigger; \
       stringlen=stringused+newlen+1; \
      } \
    strcpy(string+stringused,xx); \
    stringused+=newlen; \
   } \
 while(0)

/*+ A macro to read data that can be used by the lexer; an error from
    read_data() (-1) is passed on as a result of zero. +*/
#define YY_INPUT(buf,result,max_size) \
 do \
   { \
    if((result=read_data(html_yyfd,buf,max_size))==-1) \
       result=0; \
   } \
 while(0)
%}

%%
 /* Code and comments in this section must be indented, otherwise lex takes  */
 /* them for patterns.  The buffer below collects the text of a quoted       */
 /* attribute value and is used through reset_string and append_string.      */

 char *string=malloc(128);
 int stringlen=128,stringused=0,newlen;

 /* Handle comments and other tags */

[^<]+                           { /* html_yylval=html_yytext; return(LEX_PLAINTEXT); */ }
"<!DOCTYPE"                     { BEGIN(DOCTYPE);     reset_string; }
"<!--"                          { BEGIN(COMMENT);     reset_string; }
"<!"{W}*"-"*                    { BEGIN(COMMENT_BAD); reset_string; }
"<"{W}*                         { BEGIN(TAG_START);   reset_string; /* append_string(html_yytext); */ }

 /* Doctype (DTD) */

<DOCTYPE>">"                    { BEGIN(INITIAL); /* html_yylval=string; return(LEX_DOCTYPE); */ }
<DOCTYPE>[^>]+                  { /* append_string(html_yytext); */ }

 /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
               COMMENT is not strictly correct, but works better than the real thing. */

<COMMENT>"--"{W}*">"            { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
<COMMENT>">"                    { /* append_string(html_yytext); */ }
<COMMENT>"-"                    { /* append_string(html_yytext); */ }
<COMMENT>[^->]+                 { /* append_string(html_yytext); */ }

<COMMENT_BAD>">"                { BEGIN(INITIAL); /* html_yylval=string; return(LEX_COMMENT); */ }
<COMMENT_BAD>[^>]+              { /* append_string(html_yytext); */ }

 /* Tags */

<TAG_START>"/"?{K}+/{W}         { BEGIN(TAG); html_yylval=html_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>"/"?{K}+/">"         { BEGIN(TAG); html_yylval=html_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>(.|\n)               { BEGIN(INITIAL); }

<TAG>">"                        { BEGIN(INITIAL); html_yylval=""; return(LEX_TAG_END); }
<TAG>"<"                        { BEGIN(INITIAL); unput(html_yytext[0]); html_yylval=""; return(LEX_TAG_END); }
<TAG>{K}+                       { BEGIN(TAG_ATTR_KEY); html_yylval=html_yytext; return(LEX_ATTR_KEY); }
<TAG>(.|\n)                     { }

 /* A key that is not followed by '=' has no value; this is reported with a NULL string. */

<TAG_ATTR_KEY>{W}*=             { BEGIN(TAG_ATTR_VAL); }
<TAG_ATTR_KEY>(.|\n)            { BEGIN(TAG); unput(html_yytext[0]); html_yylval=NULL; return(LEX_ATTR_VAL); }

<TAG_ATTR_VAL>\"                { BEGIN(DQUOTED); reset_string; }
<TAG_ATTR_VAL>\'                { BEGIN(SQUOTED); reset_string; }
<TAG_ATTR_VAL>{W}+              { }
<TAG_ATTR_VAL>{F}+              { BEGIN(TAG); html_yylval=html_yytext; quote=""; return(LEX_ATTR_VAL); }
<TAG_ATTR_VAL>(.|\n)            { BEGIN(TAG); unput(html_yytext[0]); html_yylval=""; quote=""; return(LEX_ATTR_VAL); }

 /* Quoted strings - an escaped quote or a backslash is kept, a line break is dropped. */

<DQUOTED>\\\"                   { append_string(html_yytext); }
<DQUOTED>\\                     { append_string(html_yytext); }
<DQUOTED>\"                     { BEGIN(TAG); html_yylval=string; quote="\""; return(LEX_ATTR_VAL); }
<DQUOTED>(\r|\n)+               { }
<DQUOTED>[^\\\"\r\n]+           { append_string(html_yytext); }

<SQUOTED>\\\'                   { append_string(html_yytext); }
<SQUOTED>\\                     { append_string(html_yytext); }
<SQUOTED>\'                     { BEGIN(TAG); html_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
<SQUOTED>(\r|\n)+               { }
<SQUOTED>[^\\\'\r\n]+           { append_string(html_yytext); }

 /* End of file */

<<EOF>>                         { free(string); BEGIN(INITIAL); return(0); }

%%