THTMLParser v1.0

THTMLParser is a Delphi class for parsing an HTML file. The file is split into tag, text and comment objects (useful for validating tags or for automatic code corrections). A sample application is included (a very simple web browser!). Supports HTML 4.0.

How to use the HTMLParser
Create an instance of THTMLParser with

  HTMLParser:=THTMLParser.Create;

then load an HTML file, e.g.

  HTMLParser.Memory.LoadFromFile(filename);

where Memory is a regular TMemoryStream.
With

  HTMLParser.Execute;

the file will be parsed into HTMLParser.Parsed.
This TList contains objects of three classes: THTMLText, THTMLTag and THTMLComment, which are defined as follows:

  // Base class for THTMLText and THTMLComment; also used as the field type
  // inside THTMLParam and THTMLTag. It stores a span as two Integer fields
  // and exposes no public members in this declaration.
  // NOTE(review): presumably the span indexes into the parser's Memory
  // stream - confirm against the unit's implementation.
  THTMLItem = class
  private
    fPosition: Integer;
    fLength: Integer;

    // Returns the item as a String.
    function GetItem: String;
    // Sets the item from a position and a length.
    procedure SetItem(Const Position,Length: Integer);
  end;

  // One parameter (attribute) of a tag, exposed as read-only strings.
  // In the README example, the tag <BODY LINK="#FF00FF" border=0> yields
  // Key = "LINK" / Value = "#FF00FF" and Key = "border" / Value = "0".
  THTMLParam = class
  private
    // Each part is held as a THTMLItem; the getters below return it as a String.
    fRaw: THTMLItem;
    fKey: THTMLItem;
    fValue: THTMLItem;

    function GetRaw: String;
    function GetKey: String;
    function GetValue: String;
  public
    constructor Create;
    destructor Destroy; override;
  published
    // All three properties are read-only (no write accessor is declared).
    property Key: String read GetKey;
    property Value: String read GetValue;
    // NOTE(review): presumably the unprocessed parameter text as it appears
    // in the input - confirm against the implementation.
    property Raw: String read GetRaw;
  end;

  // One HTML tag found in the input. Declared as a plain class: unlike
  // THTMLText and THTMLComment it does not derive from THTMLItem, but it
  // holds THTMLItem instances for its name and raw text.
  THTMLTag = class
  private
    // Error callback slot; TOnHTMLParseError is declared outside this excerpt.
    fOnHTMLParseError: TOnHTMLParseError;
    fName: THTMLItem;
    fRaw: THTMLItem;

    function GetName: String;
    function GetRaw: String;
    procedure SetName(const Position,Length: Integer);
  public
    // List of the tag's parameters (THTMLParam objects in the README example).
    // May be nil - check before accessing Params.Count.
    Params:TList; //Maybe is nil !!!

    constructor Create;
    destructor Destroy; override;
  published
    // Both properties are read-only.
    property Name: String read GetName; // uppercased TAG (without <>)
    property Raw: String read GetRaw; // raw TAG (parameters included) as read from input file (without<>)
  end;

  // A run of text from the input (in the README example, the line
  // "Hello You &amp; Go!" becomes one THTMLText object).
  // Derives from THTMLItem and adds a read-only Text property.
  THTMLText = class(THTMLItem)
  private
    function GetText: String;
  published
    // Read-only; the example output shows entities such as &amp; left as-is.
    property Text: String read GetText;
  end;

  // A comment object produced by the parser.
  // Derives from THTMLItem and adds a read-only Comment property.
  // NOTE(review): whether Comment includes the <!-- --> delimiters is not
  // visible here - confirm against the implementation.
  THTMLComment = class(THTMLItem)
  private
    function GetComment: String;
  published
    property Comment: String read GetComment;
  end;

  // The parser itself. Usage as described in this README: create an instance,
  // load the HTML into Memory, call Execute, then read the resulting objects
  // (THTMLTag, THTMLText, THTMLComment) from the parsed list.
  THTMLParser = class(TObject)
  private
    // Storage for the error event; TOnHTMLParseError is declared outside this excerpt.
    fOnHTMLParseError: TOnHTMLParseError;
    // Internal parsing state.
    // NOTE(review): LTPos/GTPos presumably track the positions of '<' and '>'
    // - confirm against the implementation.
    LastTagName: String;
    LTPos,GTPos,LastGTPos: Integer;

    procedure Init;
    procedure Final;
    procedure AddText;
    procedure AddTag;
  public
    // Output: list of parsed objects, filled by Execute.
    parsed:TList;
    // Input: stream holding the HTML to parse (e.g. filled via LoadFromFile).
    Memory: TMemoryStream;
    constructor Create;
    destructor Destroy; override;
    // Parses the contents of Memory into the parsed list.
    procedure Execute;
  published
    // Parse-error event. Note the property name really is spelled with a
    // doubled "On" (OnOnHTMLParseError); callers must use that spelling.
    property OnOnHTMLParseError: TOnHTMLParseError read fOnHTMLParseError write fOnHTMLParseError;
  end;
Example
The HTML file

<html>
<BODY LINK="#FF00FF" border=0>
Hello You &amp; Go!
</html>

will result in 4 objects (HTMLParser.Parsed.Count=4):

[0] HTMLTag.Name = "HTML"
           .Params.count = 0

[1] HTMLTag.Name = "BODY"
           .Params.count = 2

                  [0]  HTMLParam.Key   = "LINK"
                                .Value = "#FF00FF"
                  [1]  HTMLParam.Key   = "border"
                                .Value = "0"

[2] HTMLText.Text = "Hello You &amp; Go!"

[3] HTMLTag.Name = "/HTML"
           .Params.count = 0

Comments and Bugs
Please send any comments or bugs to soarowl@yeah.net.

Important!

Please do NOT report any bugs concerning the WebBrowser sample!
The sample is not meant to be a fully HTML-compatible browser; it is programmed to show this help file only.
©2001.5.25 Nengwen Zhuo