html.cpp
资源名称:htmlpars.zip [点击查看]
上传用户:zexelpump
上传日期:2007-01-04
资源大小:22k
文件大小:8k
源码类别:
WEB源码(ASP,PHP,...)
开发平台:
Visual C++
- /*
- Implement an HTML parser using IE4's IHTMLDocument2 interface.
- */
- #include <windows.h>
- #include <comdef.h>
- #include <io.h>
- #include "html.h"
- #include <iostream>
- using namespace std;
- /*
- static function used to force dynamic allocation
- */
- HTMLParser *HTMLParser::Create()
- {
- return new HTMLParser;
- }
- // constructor/destructor
- HTMLParser::HTMLParser()
- {
- HRESULT hr;
- LPCONNECTIONPOINTCONTAINER pCPC = NULL;
- LPOLEOBJECT pOleObject = NULL;
- LPOLECONTROL pOleControl = NULL;
- // initialize all the class member variables
- m_dwRef = 1; // must start at 1 for the current instance
- m_hrConnected = S_FALSE;
- m_dwCookie = 0;
- m_pMSHTML = NULL;
- m_pCP = NULL;
- m_pAnchorLinks = NULL;
- m_pImageLinks = NULL;
- // Create an instance of an dynamic HTML document
- if (FAILED(hr = CoCreateInstance( CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (LPVOID*)&m_pMSHTML )))
- {
- goto Error;
- }
- if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IOleObject, (LPVOID*)&pOleObject)))
- {
- goto Error;
- }
- hr = pOleObject->SetClientSite((IOleClientSite*)this);
- pOleObject->Release();
- if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IOleControl, (LPVOID*)&pOleControl)))
- {
- goto Error;
- }
- hr = pOleControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
- pOleControl->Release();
- // Hook up sink to catch ready state property change
- if (FAILED(hr = m_pMSHTML->QueryInterface(IID_IConnectionPointContainer, (LPVOID*)&pCPC)))
- {
- goto Error;
- }
- if (FAILED(hr = pCPC->FindConnectionPoint(IID_IPropertyNotifySink, &m_pCP)))
- {
- goto Error;
- }
- m_hrConnected = m_pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, &m_dwCookie);
- Error:
- if (pCPC) pCPC->Release();
- }
- HTMLParser::~HTMLParser()
- {
- if ( m_pAnchorLinks )
- m_pAnchorLinks->Release();
- if ( m_pImageLinks )
- m_pImageLinks->Release();
- if (SUCCEEDED(m_hrConnected))
- m_pCP->Unadvise(m_dwCookie);
- if (m_pCP)
- m_pCP->Release();
- if ( m_pMSHTML )
- m_pMSHTML->Release();
- }
- STDMETHODIMP HTMLParser::QueryInterface(REFIID riid, LPVOID* ppv)
- {
- *ppv = NULL;
- if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
- {
- *ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
- AddRef();
- return NOERROR;
- }
- else if (IID_IOleClientSite == riid)
- {
- *ppv = (IOleClientSite*)this;
- AddRef();
- return NOERROR;
- }
- else if (IID_IDispatch == riid)
- {
- *ppv = (IDispatch*)this;
- AddRef();
- return NOERROR;
- }
- else
- return E_NOTIMPL;
- }
- STDMETHODIMP_(ULONG) HTMLParser::AddRef()
- {
- return ++m_dwRef;
- }
- STDMETHODIMP_(ULONG) HTMLParser::Release()
- {
- if (--m_dwRef == 0)
- {
- delete this;
- return 0;
- }
- return m_dwRef;
- }
- STDMETHODIMP HTMLParser::OnChanged(DISPID dispID)
- {
- HRESULT hr;
- if (DISPID_READYSTATE == dispID)
- {
- VARIANT vResult = {0};
- EXCEPINFO excepInfo;
- UINT uArgErr;
- long lReadyState;
- DISPPARAMS dp = {NULL, NULL, 0, 0};
- if (SUCCEEDED(hr = m_pMSHTML->Invoke(DISPID_READYSTATE, IID_NULL, LOCALE_SYSTEM_DEFAULT,
- DISPATCH_PROPERTYGET, &dp, &vResult, &excepInfo, &uArgErr)))
- {
- lReadyState = (READYSTATE)V_I4(&vResult);
- switch (lReadyState)
- {
- case READYSTATE_UNINITIALIZED:
- case READYSTATE_LOADING:
- case READYSTATE_LOADED:
- case READYSTATE_INTERACTIVE:
- break;
- case READYSTATE_COMPLETE:
- // IE4 is finished parsing the file
- BOOL fRet = PostThreadMessage(GetCurrentThreadId(),
- WM_USER_LOAD_COMPLETE,
- (WPARAM)0,
- (LPARAM)0);
- break;
- }
- VariantClear(&vResult);
- }
- }
- return NOERROR;
- }
- STDMETHODIMP HTMLParser::Invoke(DISPID dispIdMember,
- REFIID riid,
- LCID lcid,
- WORD wFlags,
- DISPPARAMS __RPC_FAR *pDispParams,
- VARIANT __RPC_FAR *pVarResult,
- EXCEPINFO __RPC_FAR *pExcepInfo,
- UINT __RPC_FAR *puArgErr)
- {
- if (!pVarResult)
- {
- return E_POINTER;
- }
- switch(dispIdMember)
- {
- case DISPID_AMBIENT_DLCONTROL:
- // This tells IE4 that we want to download the page,
- // but we don't want to run scripts, Java applets, or
- // ActiveX controls
- V_VT(pVarResult) = VT_I4;
- V_I4(pVarResult) = DLCTL_DOWNLOADONLY |
- DLCTL_NO_SCRIPTS |
- DLCTL_NO_JAVA |
- DLCTL_NO_DLACTIVEXCTLS |
- DLCTL_NO_RUNACTIVEXCTLS;
- break;
- default:
- return DISP_E_MEMBERNOTFOUND;
- }
- return NOERROR;
- }
- BOOL HTMLParser::LoadHTMLFile(LPCSTR pcszFile)
- {
- HRESULT hr;
- LPPERSISTFILE pPF;
- IHTMLElementCollection* pColl = NULL;
- MSG msg;
- if ( !IsConnected() )
- return FALSE;
- // kill any previous links
- if ( m_pAnchorLinks )
- {
- m_pAnchorLinks->Release();
- m_pAnchorLinks = NULL;
- }
- if ( m_pImageLinks )
- {
- m_pImageLinks->Release();
- m_pImageLinks = NULL;
- }
- // avoid IE error msg box if the file does not exist
- if ( access(pcszFile, 0x00) != 0x00 )
- {
- return FALSE;
- }
- _bstr_t bstrFile(pcszFile);
- // use IPersistFile to load the HTML
- if ( SUCCEEDED(hr = m_pMSHTML->QueryInterface(IID_IPersistFile, (LPVOID*) &pPF)))
- {
- hr = pPF->Load((LPCWSTR)bstrFile, 0);
- pPF->Release();
- }
- BOOL bOK = FALSE;
- if (SUCCEEDED(hr))
- {
- while (GetMessage(&msg, NULL, 0, 0))
- {
- // notification from OnChanged
- if (WM_USER_LOAD_COMPLETE == msg.message && NULL == msg.hwnd)
- {
- bOK = TRUE;
- break;
- }
- else
- {
- DispatchMessage(&msg);
- }
- }
- }
- if ( bOK )
- {
- try
- {
- if ( FAILED(m_pMSHTML->get_links(&m_pAnchorLinks)) ||
- FAILED(m_pMSHTML->get_images(&m_pImageLinks)) )
- {
- throw exception();
- }
- }
- catch ( exception e )
- {
- if ( m_pAnchorLinks )
- {
- m_pAnchorLinks->Release();
- m_pAnchorLinks = NULL;
- }
- if ( m_pImageLinks )
- {
- m_pImageLinks->Release();
- m_pImageLinks = NULL;
- }
- bOK = FALSE;
- }
- }
- return bOK;
- }
- /*
- Get the number of links present in the current HTML file
- */
- long HTMLParser::GetLinkCount()
- {
- long lCount = 0;
- if ( m_pAnchorLinks )
- m_pAnchorLinks->get_length(&lCount);
- return lCount;
- }
- /*
- Get the number of images present in the current HTML file
- */
- long HTMLParser::GetImageCount()
- {
- long lCount = 0;
- if ( m_pImageLinks )
- m_pImageLinks->get_length(&lCount);
- return lCount;
- }
- /*
- Get the URL associated with a given link
- */
- BOOL HTMLParser::GetLinkURL(long lIndex, string &rstrURL)
- {
- if ( IsConnected() && m_pAnchorLinks )
- return GetURLFromCollection(m_pAnchorLinks, IID_IHTMLAnchorElement, lIndex, rstrURL);
- else
- return FALSE;
- }
- /*
- Get the URL associated with a given image
- */
- BOOL HTMLParser::GetImageURL(long lIndex, string &rstrURL)
- {
- if ( IsConnected() && m_pImageLinks )
- return GetURLFromCollection(m_pImageLinks, IID_IHTMLImgElement, lIndex, rstrURL);
- else
- return FALSE;
- }
- /*
- Get the URL associated with an element in a collection. The element must
- be an image or an anchor.
- */
- BOOL HTMLParser::GetURLFromCollection(IHTMLElementCollection *pCollection, REFIID rIID, long lIndex, string &rstrURL)
- {
- VARIANT varIndex;
- VARIANT var2;
- HRESULT hr;
- IDispatch* pDisp = NULL;
- BOOL bFound = FALSE;
- varIndex.vt = VT_UINT;
- varIndex.lVal = lIndex;
- VariantInit( &var2 );
- hr = pCollection->raw_item( varIndex, var2, &pDisp );
- if ( SUCCEEDED(hr) && pDisp)
- {
- IHTMLImgElement* pImgElem = NULL;
- IHTMLAnchorElement* pAnchorElem = NULL;
- BSTR bstr = NULL;
- if ( rIID == IID_IHTMLImgElement &&
- SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pImgElem)) )
- {
- pImgElem->get_href(&bstr);
- pImgElem->Release();
- bFound = (bstr != NULL);
- }
- else if ( rIID == IID_IHTMLAnchorElement &&
- SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pAnchorElem)) )
- {
- pAnchorElem->get_href(&bstr);
- pAnchorElem->Release();
- bFound = (bstr != NULL);
- }
- pDisp->Release();
- if ( bFound && bstr )
- {
- // _bstr_t wrapper will delete since fCopy is FALSE
- _bstr_t bstrHREF(bstr, FALSE);
- rstrURL = (LPCSTR)bstrHREF;
- }
- }
- return bFound;
- }