Units UnicodeCodecs- Source http://fundementals.sourceforge.net/cUnicodeCodecs.html } { Unicode codecs v3.12 } { } { This unit is copyright c 2002-2004 by } { David J Butler and Dieter Kohler. } { } { This unit is part of Delphi Fundamentals. } { Its original file name is cUnicodeCodecs.pas } { The latest version is available from the Fundamentals home page } { http://fundementals.sourceforge.net/ } { A forum is available on SourceForge for general discussion } { http://sourceforge.net/forum/forum.php?forum_id=2117 } { } { This unit is also part of the Open XML Utility Library. } { http://www.philo.de/xml/ } { } { } { LICENSE } { } { The contents of this file are subject to the Mozilla Public License Version } { 1.1 (the "License"); you may not use this file except in compliance with } { the License. You may obtain a copy of the License at } { "http://www.mozilla.org/MPL/" } { } { Software distributed under the License is distributed on an "AS IS" basis, } { WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for } { the specific language governing rights and limitations under the License. } { } { The Original Code is "cUnicodeCodecs.pas". } { } { The Initial Developers of the Original Code are David J Butler (Pretoria, } { South Africa, "http://fundementals.sourceforge.net/") and Dieter Kohler } { (Heidelberg, Germany, "http://www.philo.de/"). Portions created by the } { Initial Developers are Copyright (C) 2002-2004 David J Butler and } { Dieter Kohler. All Rights Reserved. } { } { Alternatively, the contents of this file may be used under the terms of the } { GNU General Public License Version 2 or later (the "GPL"), in which case the } { provisions of the GPL are applicable instead of those above. If you wish to } { allow use of your version of this file only under the terms of the GPL, and } { not to allow others to use your version of this file under the terms of the } { MPL, indicate your decision by deleting the provisions above and replace } { them with the notice and other provisions required by the GPL. If you do not } { delete the provisions above, a recipient may use your version of this file } { under the terms of any one of the MPL or the GPL. } { } { } { DESCRIPTION } { } { Codecs (encoders/decoders) for Unicode text. } { } { To decode or encode Unicode text, use one of the EncodingToUTF16 or } { UTF16ToEncoding functions. } { } { For example, to convert an ISO-8859-1 string into an Unicode string: } { } { WideStr := EncodingToUTF16(TISO8859_1Codec, 'ISO-8859-1 String'); } { } { or alternatively, using an alias: } { } { WideStr := EncodingToUTF16('iso-8859-1', 'ISO-8859-1 String'); } { WideStr := EncodingToUTF16('latin1', 'ISO-8859-1 String'); } { } { } { REVISION HISTORY } { } { 17/04/2002 0.01 Initial version. ISO8859, Mac, Win1250-1252, UTF. } { 20/04/2002 0.02 EBCDIC-US. } { 28/10/2002 3.03 Refactored. } { 29/10/2002 3.04 UTF-8 string functions. } { 04/11/2002 3.05 Test cases. Fixed bug in UTF-8 encoding function. } { 23/05/2003 3.06 Detection routines. } { 28/09/2003 3.07 Renamed ASCII to USASCII for clarity. } { 30/10/2003 3.08 Moved character mappings to unit cUnicodeMaps. } { 10/01/2004 3.09 Moved generic functions to cUnicodeChar and cUnicode. } { Revision of codec classes. } { 15/03/2004 3.10 Moved character mappings into codec classes. } { UCS2 codec. } { 11/04/2004 3.11 Improved Read/Write functions. } { 19/04/2004 3.12 Small revisions. } { {$INCLUDE cDefines.inc} unit cUnicodeCodecs; interface uses { Delphi } SysUtils; const UnitName = 'cUnicodeCodecs'; UnitVersion = '3.12'; UnitCopyright = 'Copyright (c) 2002-2004 David J Butler and Dieter Kohler'; { } { UCS-4 definitions } { } {$IFDEF DELPHI5} type UCS4Char = LongWord; PUCS4Char = ^UCS4Char; {$ENDIF} const UCS4_STRING_TERMINATOR = $9C; UCS4_LF = $0A; UCS4_CR = $0D; { } { US-ASCII string functions } { } function IsUSASCIIString(const S: AnsiString): Boolean; function IsUSASCIIWideBuf(const Buf: PWideChar; const Len: Integer): Boolean; function IsUSASCIIWideString(const S: WideString): Boolean; { } { Long string conversion functions } { } procedure LongToWide(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer); function LongStringToWideString(const S: AnsiString): WideString; procedure WideToLong(const Buf: Pointer; const Len: Integer; const DestBuf: Pointer); function WideToLongString(const P: PWideChar; const Len: Integer): AnsiString; function WideStringToLongString(const S: WideString): AnsiString; { } { UTF-8 character conversion functions } { } type TUTF8Error = ( UTF8ErrorNone, UTF8ErrorInvalidEncoding, UTF8ErrorIncompleteEncoding, UTF8ErrorInvalidBuffer, UTF8ErrorOutOfRange ); function UTF8ToUCS4Char(const P: PChar; const Size: Integer; out SeqSize: Integer; out Ch: UCS4Char): TUTF8Error; function UTF8ToWideChar(const P: PChar; const Size: Integer; out SeqSize: Integer; out Ch: WideChar): TUTF8Error; procedure UCS4CharToUTF8(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); procedure WideCharToUTF8(const Ch: WideChar; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); { } { UTF-16 character conversion functions } { } procedure UCS4CharToUTF16BE(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); procedure UCS4CharToUTF16LE(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); { } { UTF-8 string functions } { } const UTF8BOMSize = 3; function DetectUTF8BOM(const P: PChar; const Size: Integer): Boolean; function UTF8CharSize(const P: PChar; const Size: Integer): Integer; function UTF8BufLength(const P: PChar; const Size: Integer): Integer; function UTF8StringLength(const S: String): Integer; function UTF8StringToWideString(const S: String): WideString; function UTF8StringToLongString(const S: String): String; function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer; function WideBufToUTF8Size(const Buf: PWideChar; const Len: Integer): Integer; function WideStringToUTF8Size(const S: WideString): Integer; function WideBufToUTF8String(const Buf: PWideChar; const Len: Integer): String; function WideStringToUTF8String(const S: WideString): String; function LongBufToUTF8Size(const Buf: PChar; const Len: Integer): Integer; function LongStringToUTF8Size(const S: String): Integer; function LongStringToUTF8String(const S: String): String; function UCS4CharToUTF8String(const Ch: UCS4Char): String; function ISO8859_1StringToUTF8String(const S: String): String; { } { UTF-16 functions } { } const UTF16BOMSize = 2; function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean; function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean; function DetectUTF16BOM(const P: PChar; const Size: Integer; out SwapEndian: Boolean): Boolean; function SwapUTF16Endian(const P: WideChar): WideChar; { } { TCustomUnicodeCodec } { Base class for Unicode Codec implementations. } { } type TCodecErrorAction = ( eaException, // Raise an exception (default) eaStop, // Stop encoding/decoding eaIgnore, // Ignore error and continue eaSkip, // Skip character and continue eaReplace); // Replace invalid character and continue TCodecReadLFOption = ( lrPass, // No normalization takes place (default) lrNormalize); // Line breaks are adjusted to Linux-style breaks with a // single LINE FEED, i.e. a sequence of CARRIAGE RETURN // ($0D) + LINE FEED ($0A) or a single CARRIAGE RETURN is // normalized to a single LINE FEED ($0A) TCodecWriteLFOption = ( lwLF, // Transcode LINE FEED into LINE FEED (default) lwCR, // Transcode LINE FEED into CARRIAGE RETURN lwCRLF); // Transcode LINE FEED into CARRIAGE RETURN + LINE FEED TCodecReadEvent = procedure (Sender: TObject; var Buf; Count: Longint; var Ok: Boolean) of object; TCodecWriteEvent = procedure (Sender: TObject; const Buf; Count: Longint) of object; TCustomUnicodeCodec = class private FErrorAction : TCodecErrorAction; FDecodeReplaceChar : WideChar; FReadLFOption : TCodecReadLFOption; FWriteLFOption : TCodecWriteLFOption; FOnRead : TCodecReadEvent; FOnWrite : TCodecWriteEvent; FReadAhead : Boolean; // Flag used for LF input normalization FReadAheadBuffer : UCS4Char; // Buffer storage LF input normalization FReadAheadByteCount : Integer; // Buffer storage LF input normalization protected procedure ResetReadAhead; procedure SetDecodeReplaceChar(const Value: WideChar); procedure SetErrorAction(const Value: TCodecErrorAction); procedure SetReadLFOption(const Value: TCodecReadLFOption); virtual; procedure SetWriteLFOption(const Value: TCodecWriteLFOption); virtual; procedure SetOnRead(const Value: TCodecReadEvent); function ReadBuffer(var Buf; Count: Integer): Boolean; procedure WriteBuffer(const Buf; Count: Integer); procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); virtual; abstract; // Implementation guidelines for derived classes: // ReadUCS4Char calls (if necessary repeatedly and sometimes // ahead) the InternalReadUCS4Char function, which must call // ReadBuffer to request buffer values. ReadBuffer calls the // OnRead event to request buffer values, similar to the Delphi // VCL TStream.Read function. // It must raise an EConvertError exception if the byte values // returned by the OnRead event contain code that cannot be // converted to a UCS-4 character or if the result value falls // into the reserved surrogate area [$D800..$DFFF]. // If ReadBuffer returns False, the UCS-4 character $9C // (STRING TERMINATOR) must be returned. // LINE FEED characters ($A) are transformed according to the // value of ReadLFOption property by the ReadUCS4Char function. procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); virtual; abstract; // Implementation guideline for derived classes: // WriteUCS4Char calls (if necessary repeatedly) the // InternalWriteUCS4Char procedure, which must call WriteBuffer // to write buffer values. WriteBuffer calls the OnWrite event // to send the buffer values, similar to the Delphi VCL // TStream.Write function. // It must raise an EConvertError exception if the specified // UCS-4 character cannot be converted into the target encoding. // If no OnWrite event handler is assigned calling WriteUCS4Char // simply has no effect. // LINE FEED characters ($A) are transformed according to the // value of the WriteLFOption property by the WriteUCS4Char // procedure. public constructor Create; virtual; constructor CreateEx(const AErrorAction: TCodecErrorAction = eaException; const ADecodeReplaceChar: WideChar = WideChar(#$FFFD); const AReadLFOption: TCodecReadLFOption = lrPass; const AWriteLFOption: TCodecWriteLFOption = lwCRLF); procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); virtual; abstract; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; virtual; abstract; procedure DecodeStr(const Buf: Pointer; const BufSize: Integer; var Dest: WideString); function EncodeStr(const S: WideString): String; procedure ReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); virtual; // Implementation guideline for derived clases: // WriteUCS4Char can be overridden to implement more efficient // handling of LINE FEED character transformations. property ErrorAction: TCodecErrorAction read FErrorAction write SetErrorAction default eaException; property DecodeReplaceChar: WideChar read FDecodeReplaceChar write SetDecodeReplaceChar default #$FFFD; property ReadLFOption: TCodecReadLFOption read FReadLFOption write SetReadLFOption default lrPass; property WriteLFOption: TCodecWriteLFOption read FWriteLFOption write SetWriteLFOption default lwLF; property OnRead: TCodecReadEvent read FOnRead write SetOnRead; property OnWrite: TCodecWriteEvent read FOnWrite write FOnWrite; end; TUnicodeCodecClass = class of TCustomUnicodeCodec; EUnicodeCodecException = class(Exception) ProcessedBytes : Integer; end; { } { Unicode Codec alias functions } { } function GetCodecClassByAlias(const CodecAlias: String): TUnicodeCodecClass; function GetEncodingName(const CodecClass: TUnicodeCodecClass): String; {$IFDEF OS_MSWIN} { } { Windows system encoding functions } { } function GetSystemEncodingName: String; {$IFDEF DELPHI6_UP}platform;{$ENDIF} function GetSystemEncodingCodecClass: TUnicodeCodecClass; {$IFDEF DELPHI6_UP}platform;{$ENDIF} {$ENDIF} { } { Encoding detection } { } function DetectUTFEncoding(const Buf: Pointer; const BufSize: Integer; var BOMSize: Integer): TUnicodeCodecClass; { } { Encoding conversion functions } { } function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const Buf: Pointer; const BufSize: Integer): WideString; overload; function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const S: String): WideString; overload; function EncodingToUTF16(const CodecAlias: String; const Buf: Pointer; const BufSize: Integer): WideString; overload; function EncodingToUTF16(const CodecAlias, S: String): WideString; overload; function UTF16ToEncoding(const CodecClass: TUnicodeCodecClass; const S: WideString): String; overload; function UTF16ToEncoding(const CodecAlias: String; const S: WideString): String; overload; { } { TUTF8Codec } { Unicode Codec implementation for UTF-8. } { } type TUTF8Codec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUTF16BECodec } { Unicode Codec implementation for UTF-16BE. } { } type TUTF16BECodec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUTF16LECodec } { Unicode Codec implementation for UTF-16LE. } { } type TUTF16LECodec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUCS4BECodec } { Unicode Codec implementation for ISO 10646 UCS-4BE. } { } type TUCS4BECodec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUCS4LECodec } { Unicode Codec implementation for ISO 10646 UCS-4BE. } { } type TUCS4LECodec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUCS4_2143Codec } { Unicode Codec implementation for ISO 10646 UCS-4BE. } { } type TUCS4_2143Codec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUCS4_3412Codec } { Unicode Codec implementation for ISO 10646 UCS-4BE. } { } type TUCS4_3412Codec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TUCS2Codec } { Unicode Codec implementation for ISO 10646 UCS-2. } { } type TUCS2Codec = class(TCustomUnicodeCodec) protected procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; procedure WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; end; { } { TCustomSingleByteCodec } { Base class for single-byte encodings. } { } type TCustomSingleByteCodec = class(TCustomUnicodeCodec) protected FEncodeReplaceChar : AnsiChar; procedure InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); override; procedure InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); override; public constructor Create; override; constructor CreateEx(const ErrorAction: TCodecErrorAction = eaException; const DecodeReplaceChar: WideChar = WideChar(#$FFFD); const EncodeReplaceChar: AnsiChar = AnsiChar(#32)); property EncodeReplaceChar: AnsiChar read FEncodeReplaceChar write FEncodeReplaceChar; procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; override; function DecodeChar(const P: AnsiChar): WideChar; virtual; abstract; function EncodeChar(const Ch: WideChar): AnsiChar; virtual; abstract; function DecodeUCS4Char(const P: AnsiChar): UCS4Char; virtual; function EncodeUCS4Char(const Ch: UCS4Char): AnsiChar; virtual; end; TUnicodeSingleByteCodecClass = class of TCustomSingleByteCodec; { } { ISO-8859 } { } type TISO8859_1Codec = class(TCustomSingleByteCodec) public procedure Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); override; function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_2Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_3Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_4Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_5Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_6Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_7Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_8Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_9Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_10Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_13Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_14Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TISO8859_15Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; { } { Windows } { } type TWindows37Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows437Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows500Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows708Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows737Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows775Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows850Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows852Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows855Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows857Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows858Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows861Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows862Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows863Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows864Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows865Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows866Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows869Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows870Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows874Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows875Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1026Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1047Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1140Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1141Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1142Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1143Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1144Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1145Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1146Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1147Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1148Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1149Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1250Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1251Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1252Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1253Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1254Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1255Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1256Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1257Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TWindows1258Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; { } { IBM } { } type TIBM037Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM038Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM256Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM273Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM274Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM275Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM277Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM278Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM280Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM281Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM284Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM285Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM290Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM297Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM420Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM423Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM424Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM437Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM500Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM850Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM851Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM852Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM855Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM857Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM860Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM861Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM862Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM863Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM864Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM865Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM866Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM868Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM869Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM870Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM871Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM874Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM875Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM880Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM904Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM905Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM918Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM1004Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM1026Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TIBM1047Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; { } { Macintosh } { } type TMacLatin2Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TMacRomanCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TMacCyrillicCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TMacGreekCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TMacIcelandicCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TMacTurkishCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; { } { International } { } type TUSASCIICodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TEBCDIC_USCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TKOI8_RCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TJIS_X0201Codec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; TNextStepCodec = class(TCustomSingleByteCodec) public function DecodeChar(const P: AnsiChar): WideChar; override; function EncodeChar(const Ch: WideChar): AnsiChar; override; end; implementation {$IFNDEF LINUX} uses { Delphi } Windows; {$ENDIF} resourcestring SCannotConvert = 'Unicode code point $%x has no equivalent in %s'; SCannotConvertUCS4 = 'Cannot convert $%8.8X to %s'; SHighSurrogateNotFound = 'High surrogate not found'; SInvalidCodePoint = '$%x is not a valid %s code point'; SInvalidEncoding = 'Invalid %s encoding'; SLongStringConvertError = 'Long string conversion error'; SLowSurrogateNotFound = 'Low surrogate not found'; SSurrogateNotAllowed = 'Surrogate value $%x found in %s. Values between $D800 and $DFFF are reserved for use with UTF-16'; SEncodingOutOfRange = '%s encoding out of range'; SUTF8Error = 'UTF-8 error %d'; { } { Type definitions } { } {$IFNDEF DELPHI6_UP} type PByte = ^Byte; PWord = ^Word; PLongWord = ^LongWord; {$ENDIF} { } { US-ASCII String functions } { } function IsUSASCIIString(const S: AnsiString): Boolean; var I : Integer; P : PAnsiChar; begin P := Pointer(S); For I := 1 to Length(S) do if Ord(P^) >= $80 then begin Result := False; exit; end else Inc(P); Result := True; end; function IsUSASCIIWideBuf(const Buf: PWideChar; const Len: Integer): Boolean; var I : Integer; P : PWideChar; begin P := Buf; For I := 1 to Len do if Ord(P^) >= $80 then begin Result := False; exit; end else Inc(P); Result := True; end; function IsUSASCIIWideString(const S: WideString): Boolean; begin Result := IsUSASCIIWideBuf(Pointer(S), Length(S)); end; { } { Long string functions } { } procedure LongToWide(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer); var I : Integer; P : Pointer; Q : Pointer; V : LongWord; begin if BufSize <= 0 then exit; P := Buf; Q := DestBuf; For I := 1 to BufSize div 4 do begin // convert 4 characters per iteration V := PLongWord(P)^; Inc(PLongWord(P)); PLongWord(Q)^ := (V and $FF) or ((V and $FF00) shl 8); Inc(PLongWord(Q)); V := V shr 16; PLongWord(Q)^ := (V and $FF) or ((V and $FF00) shl 8); Inc(PLongWord(Q)); end; // convert remaining (<4) For I := 1 to BufSize mod 4 do begin PWord(Q)^ := PByte(P)^; Inc(PByte(P)); Inc(PWord(Q)); end; end; function LongStringToWideString(const S: AnsiString): WideString; var L : Integer; begin L := Length(S); SetLength(Result, L); if L = 0 then exit; LongToWide(Pointer(S), L, Pointer(Result)); end; procedure WideToLong(const Buf: Pointer; const Len: Integer; const DestBuf: Pointer); var I : Integer; S : PWideChar; Q : PAnsiChar; V : LongWord; W : Word; begin if Len <= 0 then exit; S := Buf; Q := DestBuf; For I := 1 to Len div 2 do begin // convert 2 characters per iteration V := PLongWord(S)^; if V and $FF00FF00 <> 0 then raise EConvertError.Create(SLongStringConvertError); Q^ := AnsiChar(V); Inc(Q); Q^ := AnsiChar(V shr 16); Inc(Q); Inc(S, 2); end; // convert remaining character if Len mod 2 = 1 then begin W := Ord(S^); if W > $FF then raise EConvertError.Create(SLongStringConvertError); Q^ := AnsiChar(W); end; end; function WideToLongString(const P: PWideChar; const Len: Integer): AnsiString; var I : Integer; S : PWideChar; Q : PAnsiChar; V : WideChar; begin if Len <= 0 then begin Result := ''; exit; end; SetLength(Result, Len); S := P; Q := Pointer(Result); For I := 1 to Len do begin V := S^; if Ord(V) > $FF then raise EConvertError.Create(SLongStringConvertError); Q^ := AnsiChar(Byte(V)); Inc(S); Inc(Q); end; end; function WideStringToLongString(const S: WideString): AnsiString; begin Result := WideToLongString(Pointer(S), Length(S)); end; { } { UTF-8 character conversion functions } { } { UTF8ToUCS4Char returns UTF8ErrorNone if a valid UTF-8 sequence was decoded } { (and Ch contains the decoded UCS4 character and SeqSize contains the size } { of the UTF-8 sequence). If an incomplete UTF-8 sequence is encountered, the } { function returns UTF8ErrorIncompleteEncoding and SeqSize > Size. If an } { invalid UTF-8 sequence is encountered, the function returns } { UTF8ErrorInvalidEncoding and SeqSize (<= Size) is the size of the } { invalid sequence, and Ch may be the intended character. } function UTF8ToUCS4Char(const P: PChar; const Size: Integer; out SeqSize: Integer; out Ch: UCS4Char): TUTF8Error; var C, D : Byte; V : LongWord; I : Integer; begin if not Assigned(P) or (Size <= 0) then begin SeqSize := 0; Ch := 0; Result := UTF8ErrorInvalidBuffer; exit; end; C := Ord(P^); if C < $80 then begin SeqSize := 1; Ch := C; Result := UTF8ErrorNone; exit; end; // multi-byte characters always start with 11xxxxxx ($C0) // following bytes always start with 10xxxxxx ($80) if C and $C0 = $80 then begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; if C and $20 = 0 then // 2-byte sequence begin SeqSize := 2; V := C and $1F; end else if C and $10 = 0 then // 3-byte sequence begin SeqSize := 3; V := C and $0F; end else if C and $08 = 0 then // 4-byte sequence (max needed for Unicode $0-$1FFFFF) begin SeqSize := 4; V := C and $07; end else begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; if Size < SeqSize then // incomplete begin Ch := C; Result := UTF8ErrorIncompleteEncoding; exit; end; For I := 1 to SeqSize - 1 do begin D := Ord(P[I]); if D and $C0 <> $80 then // following byte must start with 10xxxxxx begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; V := (V shl 6) or (D and $3F); // decode 6 bits end; Ch := V; Result := UTF8ErrorNone; end; function UTF8ToWideChar(const P: PChar; const Size: Integer; out SeqSize: Integer; out Ch: WideChar): TUTF8Error; var Ch4 : UCS4Char; begin Result := UTF8ToUCS4Char(P, Size, SeqSize, Ch4); if Ch4 > $FFFF then begin Result := UTF8ErrorOutOfRange; Ch := #$0000; end else Ch := WideChar(Ch4); end; { UCS4CharToUTF8 transforms the UCS4 char Ch to UTF-8 encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-8 encoding will be placed in Dest. } procedure UCS4CharToUTF8(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); var P : PByte; begin P := Dest; if Ch < $80 then // US-ASCII (1-byte sequence) begin SeqSize := 1; if not Assigned(P) or (DestSize <= 0) then exit; P^ := Byte(Ch); end else if Ch < $800 then // 2-byte sequence begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $C0 or Byte(Ch shr 6); if DestSize = 1 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else if Ch < $10000 then // 3-byte sequence begin SeqSize := 3; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $E0 or Byte(Ch shr 12); if DestSize = 1 then exit; Inc(P); P^ := $80 or ((Ch shr 6) and $3F); if DestSize = 2 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else if Ch < $200000 then // 4-byte sequence begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $F0 or Byte(Ch shr 18); if DestSize = 1 then exit; Inc(P); P^ := $80 or ((Ch shr 12) and $3F); if DestSize = 2 then exit; Inc(P); P^ := $80 or ((Ch shr 6) and $3F); if DestSize = 3 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']); end; procedure WideCharToUTF8(const Ch: WideChar; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); begin UCS4CharToUTF8(Ord(Ch), Dest, DestSize, SeqSize); end; { } { UTF-16 character conversion functions } { } { UCS4CharToUTF16BE transforms the UCS4 char Ch to UTF-16BE encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-16BE encoding will be placed in Dest. } procedure UCS4CharToUTF16BE(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); var P : PByte; HighSurrogate, LowSurrogate : Word; begin P := Dest; Case Ch of $00000080..$0000D7FF, $0000E000..$0000FFFD : begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; P^ := Hi(Ch); if DestSize <= 1 then exit; Inc(P); P^ := Lo(Ch); end; $0000D800..$0000DFFF, $0000FFFE,$0000FFFF : raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']); $00010000..$0010FFFF : begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; HighSurrogate := $D7C0 + (Ch shr 10); P^ := Hi(HighSurrogate); if DestSize <= 1 then exit; Inc(P); P^ := Lo(HighSurrogate); if DestSize <= 2 then exit; LowSurrogate := $DC00 xor (Ch and $3FF); Inc(P); P^ := Hi(LowSurrogate); if DestSize <= 3 then exit; Inc(P); P^ := Lo(LowSurrogate); end; else // out of UTF-16 range raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16BE']); end; end; { UCS4CharToUTF16LE transforms the UCS4 char Ch to UTF-16LE encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-16LE encoding will be placed in Dest. } procedure UCS4CharToUTF16LE(const Ch: UCS4Char; const Dest: Pointer; const DestSize: Integer; out SeqSize: Integer); var P : PByte; HighSurrogate, LowSurrogate : Word; begin P := Dest; Case Ch of $00000080..$0000D7FF, $0000E000..$0000FFFD : begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; P^ := Lo(Ch); if DestSize <= 1 then exit; Inc(P); P^ := Hi(Ch); end; $0000D800..$0000DFFF, $0000FFFE, $0000FFFF : raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']); $00010000..$0010FFFF: begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; HighSurrogate := $D7C0 + (Ch shr 10); P^ := Lo(HighSurrogate); if DestSize <= 1 then exit; Inc(P); P^ := Hi(HighSurrogate); if DestSize <= 2 then exit; LowSurrogate := $DC00 xor (Ch and $3FF); Inc(P); P^ := Lo(LowSurrogate); if DestSize <= 3 then exit; Inc(P); P^ := Hi(LowSurrogate); end; else // out of UTF-16 range raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16LE']); end; end; { } { UTF-8 string functions } { } function DetectUTF8BOM(const P: PChar; const Size: Integer): Boolean; var Q : PChar; begin Result := False; if Assigned(P) and (Size >= 3) and (P^ = #$EF) then begin Q := P; Inc(Q); if Q^ = #$BB then begin Inc(Q); if Q^ = #$BF then Result := True; end; end; end; function UTF8CharSize(const P: PChar; const Size: Integer): Integer; var C : Byte; I : Integer; Q : PChar; begin if not Assigned(P) or (Size <= 0) then begin Result := 0; exit; end; C := Ord(P^); if C < $80 then // 1-byte (US-ASCII value) Result := 1 else if C and $C0 = $80 then // invalid encoding Result := 1 else begin // multi-byte character if C and $20 = 0 then Result := 2 else if C and $10 = 0 then Result := 3 else if C and $08 = 0 then Result := 4 else begin Result := 1; // invalid encoding exit; end; if Size < Result then // incomplete encoding exit; Q := P; Inc(Q); For I := 1 to Result - 1 do if Ord(Q^) and $C0 <> $80 then begin Result := 1; // invalid encoding exit; end else Inc(Q); end; end; function UTF8BufLength(const P: PChar; const Size: Integer): Integer; var Q : PChar; L, C : Integer; begin Q := P; L := Size; Result := 0; While L > 0 do begin C := UTF8CharSize(Q, L); Dec(L, C); Inc(Q, C); Inc(Result); end; end; function UTF8StringLength(const S: String): Integer; begin Result := UTF8BufLength(Pointer(S), Length(S)); end; function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer; begin if Ch < $80 then Result := 1 else if Ch < $800 then Result := 2 else if Ch < $10000 then Result := 3 else if Ch < $200000 then Result := 4 else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']); end; function WideBufToUTF8Size(const Buf: PWideChar; const Len: Integer): Integer; var P : PWideChar; I : Integer; C : UCS4Char; begin P := Buf; Result := 0; For I := 1 to Len do begin C := UCS4Char(P^); Inc(Result); if C >= $80 then if C >= $800 then Inc(Result, 2) else Inc(Result); Inc(P); end; end; function LongBufToUTF8Size(const Buf: PChar; const Len: Integer): Integer; var P : PChar; I : Integer; begin P := Buf; Result := 0; For I := 1 to Len do begin Inc(Result); if Ord(P^) >= $80 then Inc(Result); Inc(P); end; end; function WideStringToUTF8Size(const S: WideString): Integer; begin Result := WideBufToUTF8Size(Pointer(S), Length(S)); end; function LongStringToUTF8Size(const S: String): Integer; begin Result := LongBufToUTF8Size(Pointer(S), Length(S)); end; function UTF8StringToWideString(const S: String): WideString; var P : PChar; Q : PWideChar; L, M, I : Integer; C : WideChar; begin L := Length(S); if L = 0 then begin Result := ''; exit; end; if IsUSASCIIString(S) then // optimize for US-ASCII strings begin Result := LongStringToWideString(S); exit; end; // Decode UTF-8 P := Pointer(S); SetLength(Result, L); // maximum size Q := Pointer(Result); M := 0; Repeat UTF8ToWideChar(P, L, I, C); Assert(I > 0, 'I > 0'); Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); Until L <= 0; SetLength(Result, M); // actual size end; function UTF8StringToLongString(const S: String): String; var P : PChar; Q : PChar; L, M, I : Integer; C : WideChar; begin L := Length(S); if L = 0 then begin Result := ''; exit; end; if IsUSASCIIString(S) then // optimize for US-ASCII strings begin Result := S; exit; end; // Decode UTF-8 P := Pointer(S); SetLength(Result, L); // maximum size Q := Pointer(Result); M := 0; Repeat UTF8ToWideChar(P, L, I, C); Assert(I > 0, 'I > 0'); if Ord(C) > $FF then raise EConvertError.Create(SLongStringConvertError); Q^ := Char(Ord(C)); Inc(Q); Inc(M); Inc(P, I); Dec(L, I); Until L <= 0; SetLength(Result, M); // actual size end; function WideBufToUTF8String(const Buf: PWideChar; const Len: Integer): String; var P : PWideChar; Q : PChar; I, M, N, J : Integer; begin if Len = 0 then begin Result := ''; exit; end; N := WideBufToUTF8Size(Buf, Len); if N = Len then // optimize for US-ASCII strings begin Result := WideToLongString(Buf, Len); exit; end; SetLength(Result, N); P := Buf; Q := Pointer(Result); M := 0; For I := 1 to Len do begin UCS4CharToUTF8(UCS4Char(P^), Q, N, J); Inc(P); Inc(Q, J); Dec(N, J); Inc(M, J); end; SetLength(Result, M); // actual size end; function LongStringToUTF8String(const S: String): String; var P : PChar; Q : PChar; I, M, N : Integer; J, L : Integer; begin P := Pointer(S); L := Length(S); if L = 0 then begin Result := ''; exit; end; N := LongBufToUTF8Size(P, L); if N = L then // optimize for US-ASCII strings begin Result := S; exit; end; SetLength(Result, N); Q := Pointer(Result); M := 0; For I := 1 to L do begin UCS4CharToUTF8(UCS4Char(Ord(P^)), Q, N, J); Inc(P); Inc(Q, J); Dec(N, J); Inc(M, J); end; SetLength(Result, M); // actual size end; function WideStringToUTF8String(const S: WideString): String; begin Result := WideBufToUTF8String(Pointer(S), Length(S)); end; const MaxUTF8SequenceSize = 4; function UCS4CharToUTF8String(const Ch: UCS4Char): String; var Buf : Array[0..MaxUTF8SequenceSize - 1] of Byte; Size, I : Integer; P, Q : PChar; begin Size := 0; UCS4CharToUTF8(Ch, @Buf, Sizeof(Buf), Size); SetLength(Result, Size); if Size > 0 then begin P := Pointer(Result); Q := @Buf; For I := 0 to Size - 1 do begin P^ := Q^; Inc(P); Inc(Q); end; end; end; function ISO8859_1StringToUTF8String(const S: String): String; var P, Q : PChar; L, I, M, J : Integer; begin L := Length(S); if L = 0 then begin Result := ''; exit; end; // Calculate size M := L; P := Pointer(S); For I := 1 to L do begin if Ord(P^) >= $80 then Inc(M); // 2 bytes required for #$80-#$FF Inc(P); end; // Check if conversion is required if M = L then begin // All characters are US-ASCII, return reference to same string Result := S; exit; end; // Convert SetLength(Result, M); Q := Pointer(Result); P := Pointer(S); For I := 1 to L do begin WideCharToUTF8(WideChar(P^), Q, M, J); Inc(P); Inc(Q, J); Dec(M, J); end; end; { } { UTF-16 functions } { } function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean; begin Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar($FEFF)); end; function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean; begin Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar($FFFE)); end; { DetectUTF16Encoding returns True if the encoding was confirmed to be UTF-16. } { SwapEndian is True if it was detected that the UTF-16 data is in reverse } { endian from that used by the cpu. } function DetectUTF16BOM(const P: PChar; const Size: Integer; out SwapEndian: Boolean): Boolean; begin if not Assigned(P) or (Size < Sizeof(WideChar)) then begin SwapEndian := False; Result := False; end else if PWideChar(P)^ = WideChar($FEFF) then begin SwapEndian := False; Result := True; end else if PWideChar(P)^ = WideChar($FFFE) then begin SwapEndian := True; Result := True; end else begin SwapEndian := False; Result := False; end; end; function SwapUTF16Endian(const P: WideChar): WideChar; begin Result := WideChar(((Ord(P) and $FF) shl 8) or (Ord(P) shr 8)); end; { } { Helper Functions } { } type AnsiCharMap = Array[AnsiChar] of WideChar; function CharFromMap(const Ch: WideChar; const Map: AnsiCharMap; const Encoding: String): AnsiChar; var I : AnsiChar; P : PWideChar; begin if Ch = #$FFFF then raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); P := @Map; for I := #$00 to #$FF do if P^ <> Ch then Inc(P) else begin Result := I; exit; end; raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); end; type AnsiCharHighMap = Array[#$80..#$FF] of WideChar; function CharFromHighMap(const Ch: WideChar; const Map: AnsiCharHighMap; const Encoding: String): AnsiChar; var I : AnsiChar; P : PWideChar; begin if Ord(Ch) < $80 then begin Result := AnsiChar(Ch); exit; end; if Ch = #$FFFF then raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); P := @Map; for I := #$80 to #$FF do if P^ <> Ch then Inc(P) else begin Result := I; exit; end; raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); end; type AnsiCharISOMap = Array[#$A0..#$FF] of WideChar; function CharFromISOMap(const Ch: WideChar; const Map: AnsiCharISOMap; const Encoding: String): AnsiChar; var I : AnsiChar; P : PWideChar; begin if Ord(Ch) < $A0 then begin Result := AnsiChar(Ch); exit; end; if Ch = #$FFFF then raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); P := @Map; for I := #$A0 to #$FF do if P^ <> Ch then Inc(P) else begin Result := I; exit; end; raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), Encoding]); end; { } { Unicode Codec aliases } { } const USASCIIAliases = 17; USASCIIAlias : Array[0..USASCIIAliases - 1] of String = ( 'ASCII', 'US-ASCII', 'us', 'ANSI_X3.4-1968', 'ANSI_X3.4-1986', 'iso-ir-6', 'ISO_646.irv:1991', 'ISO_646.irv', 'ISO_646', 'ISO-646', 'ISO646', 'ISO646-US', 'IBM367', 'cp367', 'csASCII', 'IBM891', 'IBM903'); { } { ISO-8859-1 - Latin 1 } { Western Europe and Americas: Afrikaans, Basque, Catalan, Danish, Dutch, } { English, Faeroese, Finnish, French, Galician, German, Icelandic, Irish, } { Italian, Norwegian, Portuguese, Spanish and Swedish. } { Default for HTTP Protocol } { } const ISO8859_1Aliases = 8; ISO8859_1Alias : Array[0..ISO8859_1Aliases - 1] of String = ( 'ISO-8859-1', 'ISO_8859-1:1987', 'ISO_8859-1', 'iso-ir-100', 'latin1', 'l1', 'IBM819', 'cp819'); { } { ISO-8859-2 Latin 2 } { Latin-written Slavic and Central European languages: Czech, German, } { Hungarian, Polish, Romanian, Croatian, Slovak, Slovene. } { } const ISO8859_2Aliases = 6; ISO8859_2Alias : Array[0..ISO8859_2Aliases - 1] of String = ( 'ISO-8859-2', 'ISO_8859-2:1987', 'ISO_8859-2', 'iso-ir-101', 'latin2', 'l2'); { } { ISO-8859-3 - Latin 3 } { Esperanto, Galician, Maltese, and Turkish. } { } const ISO8859_3Aliases = 6; ISO8859_3Alias : Array[0..ISO8859_3Aliases - 1] of String = ( 'ISO-8859-3', 'ISO_8859-3:1988', 'ISO_8859-3', 'iso-ir-109', 'latin3', 'l3'); { } { ISO-8859-4 - Latin 4 } { Scandinavia/Baltic (mostly covered by 8859-1 also): Estonian, Latvian, and } { Lithuanian. It is an incomplete predecessor of Latin 6. } { } const ISO8859_4Aliases = 6; ISO8859_4Alias : Array[0..ISO8859_4Aliases - 1] of String = ( 'ISO-8859-4', 'ISO_8859-4:1988', 'ISO_8859-4', 'iso-ir-110', 'latin4', 'l4'); { } { ISO-8859-5 - Cyrillic } { Bulgarian, Byelorussian, Macedonian, Russian, Serbian and Ukrainian. } { } const ISO8859_5Aliases = 5; ISO8859_5Alias : Array[0..ISO8859_5Aliases - 1] of String = ( 'ISO-8859-5', 'ISO_8859-5:1988', 'ISO_8859-5', 'iso-ir-144', 'cyrillic'); { } { ISO-8859-6 - Arabic } { Non-accented Arabic. } { } const ISO8859_6Aliases = 7; ISO8859_6Alias : Array[0..ISO8859_6Aliases - 1] of String = ( 'ISO-8859-6', 'ISO_8859-6:1987', 'ISO_8859-6', 'iso-ir-127', 'ECMA-114', 'ASMO-708', 'arabic'); { } { ISO-8859-7 - Modern Greek } { Greek. } { } const ISO8859_7Aliases = 8; ISO8859_7Alias : Array[0..ISO8859_7Aliases - 1] of String = ( 'ISO-8859-7', 'ISO_8859-7:1987', 'ISO_8859-7', 'iso-ir-126', 'ELOT_928', 'ECMA-118', 'greek', 'greek8'); { } { ISO-8859-8 - Hebrew } { Non-accented Hebrew. } { } const ISO8859_8Aliases = 5; ISO8859_8Alias : Array[0..ISO8859_8Aliases - 1] of String = ( 'ISO-8859-8', 'ISO_8859-8:1988', 'ISO_8859-8', 'iso-ir-138', 'hebrew'); { } { ISO-8859-9 - Latin 5 } { Same as 8859-1 except for Turkish instead of Icelandic } { } const ISO8859_9Aliases = 6; ISO8859_9Alias : Array[0..ISO8859_9Aliases - 1] of String = ( 'ISO-8859-9', 'ISO_8859-9:1989', 'ISO_8859-9', 'iso-ir-148', 'latin5', 'l5'); { } { ISO-8859-10 - Latin 6 } { Latin6, for Lappish/Nordic/Eskimo languages: Adds the last Inuit } { (Greenlandic) and Sami (Lappish) letters that were missing in Latin 4 to } { cover the entire Nordic area. } { } const ISO8859_10Aliases = 6; ISO8859_10Alias : Array[0..ISO8859_10Aliases - 1] of String = ( 'ISO-8859-10', 'ISO_8859-10:1992', 'ISO_8859-10', 'iso-ir-157', 'latin6', 'l6'); { } { ISO-8859-13 - Latin 7 } { } const ISO8859_13Aliases = 4; ISO8859_13Alias : Array[0..ISO8859_13Aliases - 1] of String = ( 'ISO-8859-13', 'ISO_8859-13', 'latin7', 'l7'); { } { ISO-8859-14 - Latin 8 } { } const ISO8859_14Aliases = 7; ISO8859_14Alias : Array[0..ISO8859_14Aliases - 1] of String = ( 'ISO-8859-14', 'ISO_8859-14:1998', 'ISO_8859-14', 'iso-ir-199', 'latin8', 'l8', 'iso-celtic'); { } { ISO-8859-15 - Latin 9 } { } const ISO8859_15Aliases = 6; ISO8859_15Alias : Array[0..ISO8859_15Aliases - 1] of String = ( 'ISO-8859-15', 'ISO_8859-15', 'latin9', 'l9', 'latin0', 'l0'); { } { KOI8-R } { } const KOI8_RAliases = 1; KOI8_RAlias : Array[0..KOI8_RAliases - 1] of String = ( 'KOI8-R'); { } { Mac Latin-2 } { } const MacLatin2Aliases = 3; MacLatin2Alias : Array[0..MacLatin2Aliases - 1] of String = ( 'MacLatin2', 'Mac', 'Macintosh'); { } { Mac Roman } { } const MacRomanAliases = 1; MacRomanAlias : Array[0..MacRomanAliases - 1] of String = ( 'MacRoman'); { } { Mac Cyrillic } { } const MacCyrillicAliases = 1; MacCyrillicAlias : Array[0..MacCyrillicAliases - 1] of String = ( 'MacCyrillic'); { } { CP437 - DOSLatinUS } { Original IBM PC encoding } { } const CP437Aliases = 3; CP437Alias : Array[0..CP437Aliases - 1] of String = ( 'IBM437', 'cp437', 'DOSLatinUS'); { } { Windows-1250 } { } const Win1250Aliases = 3; Win1250Alias : Array[0..Win1250Aliases - 1] of String = ( 'windows-1250', 'cp1250', 'WinLatin2'); { } { Windows-1251 } { } const Win1251Aliases = 3; Win1251Alias : Array[0..Win1251Aliases - 1] of String = ( 'windows-1251', 'cp1251', 'WinCyrillic'); { } { Windows-1252 } { } const Win1252Aliases = 3; Win1252Alias : Array[0..Win1252Aliases - 1] of String = ( 'windows-1252', 'cp1252', 'WinLatin1'); { } { EBCDIC-US } { } const EBCDIC_USAliases = 2; EBCDIC_USAlias : Array[0..EBCDIC_USAliases - 1] of String = ( 'ebcdic-us', 'ebcdic'); { } { UTF-8 } { } const UTF8Aliases = 2; UTF8Alias : Array[0..UTF8Aliases - 1] of String = ( 'UTF-8', 'utf8'); { } { UTF-16BE } { } const UTF16BEAliases = 3; UTF16BEAlias : Array[0..UTF16BEAliases - 1] of String = ( 'UTF-16BE', 'UTF-16', 'utf16'); { } { UTF-16LE } { } const UTF16LEAliases = 2; UTF16LEAlias : Array[0..UTF16LEAliases - 1] of String = ( 'UTF-16LE', 'utf16le'); { } { Unicode Codec alias table } { } type UnicodeCodecAliasInfo = record Table : Pointer; Count : Integer; Codec : TUnicodeCodecClass; end; const UnicodeCodecAliasEntries = 26; UnicodeCodecAliasList : Array[0..UnicodeCodecAliasEntries - 1] of UnicodeCodecAliasInfo = ((Table:@USASCIIAlias; Count: USASCIIAliases; Codec: TUSASCIICodec), (Table:@ISO8859_1Alias; Count: ISO8859_1Aliases; Codec: TISO8859_1Codec), (Table:@ISO8859_2Alias; Count: ISO8859_2Aliases; Codec: TISO8859_2Codec), (Table:@ISO8859_3Alias; Count: ISO8859_3Aliases; Codec: TISO8859_3Codec), (Table:@ISO8859_4Alias; Count: ISO8859_4Aliases; Codec: TISO8859_4Codec), (Table:@ISO8859_5Alias; Count: ISO8859_5Aliases; Codec: TISO8859_5Codec), (Table:@ISO8859_6Alias; Count: ISO8859_6Aliases; Codec: TISO8859_6Codec), (Table:@ISO8859_7Alias; Count: ISO8859_7Aliases; Codec: TISO8859_7Codec), (Table:@ISO8859_8Alias; Count: ISO8859_8Aliases; Codec: TISO8859_8Codec), (Table:@ISO8859_9Alias; Count: ISO8859_9Aliases; Codec: TISO8859_9Codec), (Table:@ISO8859_10Alias; Count: ISO8859_10Aliases; Codec: TISO8859_10Codec), (Table:@ISO8859_13Alias; Count: ISO8859_13Aliases; Codec: TISO8859_13Codec), (Table:@ISO8859_14Alias; Count: ISO8859_14Aliases; Codec: TISO8859_14Codec), (Table:@ISO8859_15Alias; Count: ISO8859_15Aliases; Codec: TISO8859_15Codec), (Table:@KOI8_RAlias; Count: KOI8_RAliases; Codec: TKOI8_RCodec), (Table:@MacLatin2Alias; Count: MacLatin2Aliases; Codec: TMacLatin2Codec), (Table:@MacRomanAlias; Count: MacRomanAliases; Codec: TMacRomanCodec), (Table:@MacCyrillicAlias; Count: MacCyrillicAliases; Codec: TMacCyrillicCodec), (Table:@CP437Alias; Count: CP437Aliases; Codec: TIBM037Codec), (Table:@Win1250Alias; Count: Win1250Aliases; Codec: TWindows1250Codec), (Table:@Win1251Alias; Count: Win1251Aliases; Codec: TWindows1251Codec), (Table:@Win1252Alias; Count: Win1252Aliases; Codec: TWindows1252Codec), (Table:@EBCDIC_USAlias; Count: EBCDIC_USAliases; Codec: TEBCDIC_USCodec), (Table:@UTF8Alias; Count: UTF8Aliases; Codec: TUTF8Codec), (Table:@UTF16BEAlias; Count: UTF16BEAliases; Codec: TUTF16BECodec), (Table:@UTF16LEAlias; Count: UTF16LEAliases; Codec: TUTF16LECodec) ); { } { Unicode Codec alias functions } { } function GetCodecClassByAlias(const CodecAlias: String): TUnicodeCodecClass; var I, J : Integer; P : PString; begin For I := 0 to UnicodeCodecAliasEntries - 1 do begin P := UnicodeCodecAliasList[I].Table; For J := 0 to UnicodeCodecAliasList[I].Count - 1 do begin if AnsiCompareText(CodecAlias, P^) = 0 then begin Result := UnicodeCodecAliasList[I].Codec; exit; end; Inc(P); end; end; Result := nil; end; function GetEncodingName(const CodecClass: TUnicodeCodecClass): String; var I : Integer; begin For I := 0 to UnicodeCodecAliasEntries - 1 do if UnicodeCodecAliasList[I].Codec = CodecClass then begin Result := PString(UnicodeCodecAliasList[I].Table)^; exit; end; Result := ''; end; {$IFDEF OS_MSWIN} { } { MSWindows system encoding functions } { } function GetSystemEncodingName: String; begin // GetACP returns the current ANSI code-page identifier for the system, // or a default identifier if no code page is current. Case GetACP of 874 : Result := 'cp874'; // Thai 932 : Result := 'cp932'; // Japan 936 : Result := 'cp936'; // Chinese (PRC, Singapore) 949 : Result := 'cp949'; // Korean 950 : Result := 'cp950'; // Chinese (Taiwan, Hong Kong) 1200 : Result := 'ISO-10646-UCS-2'; // Unicode (BMP of ISO 10646) 1250 : Result := 'windows-1250'; // Windows 3.1 Eastern European 1251 : Result := 'windows-1251'; // Windows 3.1 Cyrillic 1252 : Result := 'windows-1252'; // Windows 3.1 Latin 1 (US, Western Europe) 1253 : Result := 'windows-1253'; // Windows 3.1 Greek 1254 : Result := 'windows-1254'; // Windows 3.1 Turkish 1255 : Result := 'windows-1255'; // Hebrew 1256 : Result := 'windows-1256'; // Arabic 1257 : Result := 'windows-1257'; // Baltic else Result := ''; end; end; function GetSystemEncodingCodecClass: TUnicodeCodecClass; begin Case GetACP of 874 : Result := TWindows874Codec; // Thai 932 : Result := nil; // Japan -- Not supported 936 : Result := nil; // Chinese (PRC, Singapore) -- Not supported 949 : Result := nil; // Korean -- Not supported 950 : Result := nil; // Chinese (Taiwan, Hong Kong) -- Not supported 1200 : Result := nil; // Unicode (BMP of ISO 10646) -- Not supported 1250 : Result := TWindows1250Codec; // Windows 3.1 Eastern European 1251 : Result := TWindows1251Codec; // Windows 3.1 Cyrillic 1252 : Result := TWindows1252Codec; // Windows 3.1 Latin 1 (US, Western Europe) 1253 : Result := TWindows1253Codec; // Windows 3.1 Greek 1254 : Result := TWindows1254Codec; // Windows 3.1 Turkish 1255 : Result := TWindows1255Codec; // Hebrew 1256 : Result := TWindows1256Codec; // Arabic 1257 : Result := TWindows1257Codec; // Baltic else Result := nil; end; end; {$ENDIF} { } { Encoding detection } { } function DetectUTFEncoding(const Buf: Pointer; const BufSize: Integer; var BOMSize: Integer): TUnicodeCodecClass; var R : Boolean; begin if DetectUTF16BOM(Buf, BufSize, R) then begin BOMSize := UTF16BOMSize; if R then Result := TUTF16LECodec else Result := TUTF16BECodec end else if DetectUTF8BOM(Buf, BufSize) then begin BOMSize := UTF8BOMSize; Result := TUTF8Codec; end else begin BOMSize := 0; Result := nil; end; end; { } { Unicode conversion functions } { } function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const Buf: Pointer; const BufSize: Integer): WideString; var C : TCustomUnicodeCodec; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try C.DecodeStr(Buf, BufSize, Result); finally C.Free; end; end; function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const S: String): WideString; var C : TCustomUnicodeCodec; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try C.DecodeStr(PChar(S), Length(S), Result); finally C.Free; end; end; function EncodingToUTF16(const CodecAlias: String; const Buf: Pointer; const BufSize: Integer): WideString; begin Result := EncodingToUTF16(GetCodecClassByAlias(CodecAlias), Buf, BufSize); end; function EncodingToUTF16(const CodecAlias, S: String): WideString; begin Result := EncodingToUTF16(GetCodecClassByAlias(CodecAlias), S); end; function UTF16ToEncoding(const CodecClass: TUnicodeCodecClass; const S: WideString): String; var C : TCustomUnicodeCodec; I : Integer; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try Result := C.Encode(Pointer(S), Length(S), I); finally C.Free; end; end; function UTF16ToEncoding(const CodecAlias: String; const S: WideString): String; begin Result := UTF16ToEncoding(GetCodecClassByAlias(CodecAlias), S); end; { } { EUnicodeCodecException helper functions } { } procedure RaiseUnicodeCodecException(const Msg: String; const ProcessedBytes: Integer); overload; var E : EUnicodeCodecException; begin E := EUnicodeCodecException.Create(Msg); E.ProcessedBytes := ProcessedBytes; raise E; end; procedure RaiseUnicodeCodecException(const Msg: string; const Args: array of const; const ProcessedBytes: Integer); overload; var E : EUnicodeCodecException; begin E := EUnicodeCodecException.CreateFmt(Msg, Args); E.ProcessedBytes := ProcessedBytes; end; { } { TCustomUnicodeCodec } { } constructor TCustomUnicodeCodec.Create; begin inherited Create; FDecodeReplaceChar := WideChar(#$FFFD); FErrorAction := eaException; FReadLFOption := lrPass; FWriteLFOption := lwLF; ResetReadAhead; end; constructor TCustomUnicodeCodec.CreateEx(const AErrorAction: TCodecErrorAction; const ADecodeReplaceChar: WideChar; const AReadLFOption: TCodecReadLFOption; const AWriteLFOption: TCodecWriteLFOption); begin inherited Create; FErrorAction := AErrorAction; FDecodeReplaceChar := ADecodeReplaceChar; FReadLFOption := AReadLFOption; FWriteLFOption := AWriteLFOption; ResetReadAhead; end; procedure TCustomUnicodeCodec.ResetReadAhead; begin FReadAhead := False; FReadAheadBuffer := 0; end; procedure TCustomUnicodeCodec.SetDecodeReplaceChar(const Value: WideChar); begin FDecodeReplaceChar := Value; end; procedure TCustomUnicodeCodec.SetErrorAction(const Value: TCodecErrorAction); begin FErrorAction := Value; end; procedure TCustomUnicodeCodec.SetReadLFOption(const Value: TCodecReadLFOption); begin FReadLFOption := Value; end; procedure TCustomUnicodeCodec.SetWriteLFOption(const Value: TCodecWriteLFOption); begin FWriteLFOption := Value; end; procedure TCustomUnicodeCodec.SetOnRead(const Value: TCodecReadEvent); begin if @Value <> @FOnRead then begin ResetReadAhead; FOnRead := Value; end; end; procedure TCustomUnicodeCodec.DecodeStr(const Buf: Pointer; const BufSize: Integer; var Dest: WideString); var P : PChar; Q : PWideChar; L, M : Integer; I, J : Integer; begin P := Buf; L := BufSize; if not Assigned(P) or (L <= 0) then begin Dest := ''; exit; end; SetLength(Dest, BufSize); M := 0; Repeat Q := Pointer(Dest); Inc(Q, M); Decode(P, L, Q, BufSize * Sizeof(WideChar), I, J); Dec(L, I); Inc(P, I); Inc(M, J); if (J < BufSize) or (L <= 0) then break; SetLength(Dest, M + BufSize); Until False; if Length(Dest) <> M then SetLength(Dest, M); end; function TCustomUnicodeCodec.EncodeStr(const S: WideString): String; var I : Integer; begin Result := Encode(Pointer(S), Length(S), I); end; function TCustomUnicodeCodec.ReadBuffer(var Buf; Count: Integer): Boolean; begin Result := False; if Assigned(FOnRead) then FOnRead(self, Buf, Count, Result); end; procedure TCustomUnicodeCodec.WriteBuffer(const Buf; Count: Integer); begin if Assigned(FOnWrite) then FOnWrite(self, Buf, Count); end; procedure TCustomUnicodeCodec.ReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); begin // Get UCS4 character from read-ahead buffer or from InternalReadUCS4Char if FReadAhead then begin C := FReadAheadBuffer; ByteCount := FReadAheadByteCount; FReadAhead := False; end else InternalReadUCS4Char(C, ByteCount); // Adjust line breaks to Linux-style breaks with a single LINE FEED character if (C = UCS4_CR) and (ReadLFOption = lrNormalize) then begin InternalReadUCS4Char(FReadAheadBuffer, FReadAheadByteCount); if FReadAheadBuffer = UCS4_LF then Inc(ByteCount, FReadAheadByteCount) else FReadAhead := True; C := UCS4_LF; end; end; procedure TCustomUnicodeCodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var ByteCount2 : Integer; begin if C = UCS4_LF then // Transform LINE FEED character Case WriteLFOption of lwLF : InternalWriteUCS4Char(UCS4_LF, ByteCount); lwCR : InternalWriteUCS4Char(UCS4_CR, ByteCount); lwCRLF : begin InternalWriteUCS4Char(UCS4_CR, ByteCount); InternalWriteUCS4Char(UCS4_LF, ByteCount2); Inc(ByteCount, ByteCount2); end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TCustomSingleByteCodec } { } constructor TCustomSingleByteCodec.Create; begin inherited Create; FEncodeReplaceChar := AnsiChar(#32); end; constructor TCustomSingleByteCodec.CreateEx(const ErrorAction: TCodecErrorAction; const DecodeReplaceChar: WideChar; const EncodeReplaceChar: AnsiChar); begin inherited CreateEx(ErrorAction, DecodeReplaceChar); FEncodeReplaceChar := EncodeReplaceChar; end; procedure TCustomSingleByteCodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PChar; Q : PWideChar; I, L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div Sizeof(WideChar); if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; For I := 1 to BufSize do try if L >= C then break; Q^ := DecodeChar(P^); Inc(P); Inc(Q); Inc(L); except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, P - Buf); eaStop : break; eaSkip : Inc(P); eaIgnore : begin Q^ := WideChar(P^); Inc(P); Inc(Q); Inc(L); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); end; end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TCustomSingleByteCodec.DecodeUCS4Char(const P: AnsiChar): UCS4Char; begin Result := Ord(DecodeChar(P)); end; function TCustomSingleByteCodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PChar; Q : PWideChar; I, L, M : Integer; begin Q := S; if not Assigned(Q) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length); L := 0; M := 0; P := Pointer(Result); For I := 1 to Length do try P^ := EncodeChar(Q^); Inc(P); Inc(Q); Inc(L); Inc(M); except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, L); eaStop : break; eaSkip : begin Inc(Q); Inc(L); end; eaIgnore : begin P^ := Char(Q^); Inc(P); Inc(Q); Inc(L); Inc(M); end; eaReplace : begin P^ := FEncodeReplaceChar; Inc(P); Inc(Q); Inc(L); Inc(M); end; end; end; if Length <> M then SetLength(Result, M); ProcessedChars := L; end; function TCustomSingleByteCodec.EncodeUCS4Char(const Ch: UCS4Char): AnsiChar; begin if Ch < $10000 then Result := EncodeChar(WideChar(Ch)) else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, '']); end; procedure TCustomSingleByteCodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : AnsiChar; begin if ReadBuffer(B, 1) then begin C := Ord(DecodeChar(B)); ByteCount := 1; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TCustomSingleByteCodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var E : Char; begin E := EncodeUCS4Char(C); WriteBuffer(E, 1); ByteCount := 1; end; { } { UTF-8 } { } procedure TUTF8Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PChar; Q : PWideChar; L, I : Integer; M, N : Integer; R : TUTF8Error; C : WideChar; begin P := Buf; L := BufSize; Q := DestBuf; N := DestSize div Sizeof(WideChar); if not Assigned(P) or (L <= 0) or not Assigned(Q) or (N <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; M := 0; Repeat if M >= N then break; try R := UTF8ToWideChar(P, L, I, C); Case R of UTF8ErrorNone : begin Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; UTF8ErrorInvalidEncoding : raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); UTF8ErrorIncompleteEncoding : begin ProcessedBytes := BufSize - L; DestLength := M; exit; end; UTF8ErrorOutOfRange : raise EConvertError.CreateFmt(SEncodingOutOfRange, ['UTF-8']); else raise EConvertError.CreateFmt(SUTF8Error, [Ord(R)]); end; except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, BufSize - L); eaStop : break; eaSkip : begin Inc(P, I); Dec(L, I); end; eaIgnore : begin Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; end; end; Until L <= 0; ProcessedBytes := BufSize - L; DestLength := M; end; function TUTF8Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PWideChar; Q : PChar; I, L, M, J : Integer; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 3; SetLength(Result, L); Q := Pointer(Result); M := 0; For I := 1 to Length do begin WideCharToUTF8(P^, Q, L, J); Inc(P); Inc(Q, J); Dec(L, J); Inc(M, J); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUTF8Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); const MaxCode: array[1..6] of LongWord = ($7F, $7FF, $FFFF, $1FFFFF, $3FFFFFF, $7FFFFFFF); var B, First, Mask: Byte; begin if not ReadBuffer(B, 1) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; exit; end; C := B; ByteCount := 1; if C >= $80 then begin // UTF-8 sequence First := B; Mask := $40; if (B and $C0 <> $C0) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); while (Mask and First <> 0) do begin if not ReadBuffer(B, 1) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); if (B and $C0) <> $80 then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); C := (C shl 6) or (B and $3F); // Add bits to C Inc(ByteCount); // Increase sequence length Mask := Mask shr 1; // Adjust Mask end; if ByteCount > 6 then // No 0 bit in sequence header 'First' raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); C := C and MaxCode[ByteCount]; // dispose of header bits // Check for invalid sequence as suggested by RFC2279 if ((ByteCount > 1) and (C <= MaxCode[ByteCount - 1])) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); end; end; procedure TUTF8Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF8(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer, ByteCount); end; procedure TUTF8Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF8_LF : Byte = $0A; UTF8_CR : Byte = $0D; UTF8_CRLF : Array[0..1] of Byte = ($0D, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF8_LF, 1); ByteCount := 1; end; lwCR: begin WriteBuffer(UTF8_CR, 1); ByteCount := 1; end; lwCRLF: begin WriteBuffer(UTF8_CRLF, 2); ByteCount := 2; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { UTF-16BE } { } procedure TUTF16BECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var L, M : Integer; P, Q : PWideChar; begin L := BufSize; if L > DestSize then L := DestSize; if L <= 1 then begin ProcessedBytes := 0; DestLength := 0; exit; end; Dec(L, L mod Sizeof(WideChar)); M := L div Sizeof(WideChar); P := Buf; Q := DestBuf; Move(P^, Q^, L); DestLength := M; ProcessedBytes := L; end; function TUTF16BECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var L : Integer; begin if Length <= 0 then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 2; SetLength(Result, L); Move(S^, Pointer(Result)^, L); ProcessedChars := Length; end; procedure TUTF16BECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var LowSurrogate: Array[0..1] of Byte; // We do not use Word, because the byte order of a Word is CPU dependant begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; Exit; end; C := Swap(C); // UCS4Chars are stored in Little Endian mode; so we need to swap the bytes. Case C of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if not ReadBuffer(LowSurrogate, 2) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); Case LowSurrogate[0] of $DC..$DF: begin C := ((C - $D7C0) shl 10) + ((LowSurrogate[0] xor $DC) shl 8) + LowSurrogate[1]; ByteCount := 4; end; else raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); else ByteCount := 2; end; end; procedure TUTF16BECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF16BE(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer[0], ByteCount); end; procedure TUTF16BECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF16BE_LF : Array[0..1] of Byte = ($00, $0A); UTF16BE_CR : Array[0..1] of Byte = ($00, $0D); UTF16BE_CRLF : Array[0..3] of Byte = ($00, $0D, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF16BE_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UTF16BE_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UTF16BE_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { UTF-16LE } { } procedure TUTF16LECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var I, L, M : Integer; P, Q : PWideChar; begin L := BufSize; if L > DestSize then L := DestSize; if L <= 1 then begin ProcessedBytes := 0; DestLength := 0; exit; end; Dec(L, L mod Sizeof(WideChar)); M := L div Sizeof(WideChar); P := Buf; Q := DestBuf; For I := 1 to M do begin Q^ := SwapUTF16Endian(P^); Inc(P); Inc(Q); end; DestLength := M; ProcessedBytes := L; end; function TUTF16LECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var I, L : Integer; P, Q : PWideChar; begin if Length <= 0 then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 2; SetLength(Result, L); P := S; Q := Pointer(Result); For I := 1 to Length do begin Q^ := SwapUTF16Endian(P^); Inc(P); Inc(Q); end; ProcessedChars := Length; end; procedure TUTF16LECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var LowSurrogate : Array[0..1] of Byte; // We do not use Word, because the byte order of a Word is CPU dependant begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; Exit; end; Case C of // UCS4Chars are stored in Little Endian mode; so we just can go on with it. $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if not ReadBuffer(LowSurrogate, 2) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); Case LowSurrogate[1] of $DC..$DF: begin C := ((C - $D7C0) shl 10) + ((LowSurrogate[1] xor $DC) shl 8) + LowSurrogate[0]; ByteCount := 4; end; else raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); else ByteCount := 2; end; end; procedure TUTF16LECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF16LE(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer[0], ByteCount); end; procedure TUTF16LECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF16LE_LF : Array[0..1] of Byte = ($0A, $00); UTF16LE_CR : Array[0..1] of Byte = ($0D, $00); UTF16LE_CRLF : Array[0..3] of Byte = ($0D, $00, $0A, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF16LE_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UTF16LE_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UTF16LE_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4BECodec } { } procedure TUCS4BECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord(P^) * $1000000 + Ord((P + 1)^) * $10000 + Ord((P + 2)^) * $100 + Ord((P + 3)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := (P + 1)^; // Nevertheless change Big Endian to Little Endian ... (Q + 1)^ := P^; (Q + 2)^ := (P + 3)^; (Q + 3)^ := (P + 2)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else begin if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 1)^) shl 6) + (Ord((P + 2)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 1)^) shr 2)); (Q + 2)^ := (P + 3)^; (Q + 3)^ := Char($DC + (3 and Ord((P + 2)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 3)^; (Q + 1)^ := (P + 2)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4BECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(0); (Q+1)^ := Char((HighSurrogate - $D7C0) shr 6); (Q+2)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q+3)^ := Char(Lo(LowSurrogate)); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q+1)^ := Char(0); (Q+2)^ := Char(Hi(Ord(P^))); (Q+3)^ := Char(Lo(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4BECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[0] * $1000000 + B[1] * $10000 + B[2] * $100 + B[3]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4BE']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4BECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := C shr 24; Buffer[1] := (C and $FF0000) shr 16; Buffer[2] := (C and $FF00) shr 8; Buffer[3] := C and $FF; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4BECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4BE_LF : Array[0..3] of Byte = ($0A, $00, $00, $00); UCS4BE_CR : Array[0..3] of Byte = ($0D, $00, $00, $00); UCS4BE_CRLF : Array[0..7] of Byte = ($0D, $00, $00, $00, $0A, $00, $00, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4BE_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4BE_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4BE_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4LECodec } { } procedure TUCS4LECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 3)^) * $1000000 + Ord((P + 2)^) * $10000 + Ord((P + 1)^) * $100 + Ord(P^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; (Q + 2)^ := (P + 2)^; (Q + 3)^ := (P + 3)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 2)^) shl 6) + (Ord((P + 1)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 2)^) shr 2)); (Q + 2)^ := P^; (Q + 3)^ := Char($DC + (3 and Ord((P + 1)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4LECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(Lo(LowSurrogate)); (Q + 1)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q + 2)^ := Char((HighSurrogate - $D7C0) shr 6); (Q + 3)^ := Char(0); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Hi(Ord(P^))); (Q + 3)^ := Char(Lo(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4LECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[3] * $1000000 + B[2] * $10000 + B[1] * $100 + B[0]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4LE']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4LECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := C and $FF; Buffer[1] := (C and $FF00) shr 8; Buffer[2] := (C and $FF0000) shr 16; Buffer[3] := C shr 24; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4LECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4LE_LF : Array[0..3] of Byte = ($00, $00, $00, $0A); UCS4LE_CR : Array[0..3] of Byte = ($00, $00, $00, $0D); UCS4LE_CRLF : Array[0..7] of Byte = ($00, $00, $00, $0D, $00, $00, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4LE_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4LE_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4LE_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4_2143Codec } { } procedure TUCS4_2143Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 1)^) * $1000000 + Ord(P^) * $10000 + Ord((P + 3)^) * $100 + Ord((P + 2)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; (Q + 2)^ := (P + 2)^; (Q + 3)^ := (P + 3)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord(P^) shl 6) + (Ord((P + 3)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord(P^) shr 2)); (Q + 2)^ := (P + 2)^; (Q + 3)^ := Char($DC + (3 and Ord((P + 3)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 2)^; (Q + 1)^ := (P + 3)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4_2143Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do begin Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char((HighSurrogate - $D7C0) shr 6); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Lo(LowSurrogate)); (Q + 3)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Lo(Ord(P^))); (Q + 3)^ := Char(Hi(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4_2143Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[1] * $1000000 + B[0] * $10000 + B[3] * $100 + B[2]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4']); end; end else C := UCS4_STRING_TERMINATOR; ByteCount := 4; end; procedure TUCS4_2143Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := (C and $FF0000) shr 16; Buffer[1] := C shr 24; Buffer[2] := C and $FF; Buffer[3] := (C and $FF00) shr 8; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4_2143Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4_2143_LF : Array[0..3] of Byte = ($00, $0A, $00, $00); UCS4_2143_CR : Array[0..3] of Byte = ($00, $0D, $00, $00); UCS4_2143_CRLF : Array[0..7] of Byte = ($00, $0D, $00, $00, $00, $0A, $00, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4_2143_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4_2143_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4_2143_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4_3412Codec } { } procedure TUCS4_3412Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 2)^) * $1000000 + Ord((P + 3)^) * $10000 + Ord(P^) * $100 + Ord((P + 1)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := (P + 1)^; // Nevertheless change Big Endian to Little Endian ... (Q + 1)^ := P^; (Q + 2)^ := (P + 3)^; (Q + 3)^ := (P + 2)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 3)^) shl 6) + (Ord(P^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 3)^) shr 2)); (Q + 2)^ := (P + 1)^; (Q + 3)^ := Char($DC + (3 and Ord(P^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 1)^; (Q + 1)^ := P^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4_3412Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q + 1)^ := Char(Lo(LowSurrogate)); (Q + 2)^ := Char(0); (Q + 3)^ := Char((HighSurrogate - $D7C0) shr 6); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(Hi(Ord(P^))); (Q + 1)^ := Char(Lo(Ord(P^))); (Q + 2)^ := Char(0); (Q + 3)^ := Char(0); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4_3412Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[2] * $1000000 + B[3] * $10000 + B[0] * $100 + B[1]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4_3412Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := (C and $FF00) shr 8; Buffer[1] := C and $FF; Buffer[2] := C shr 24; Buffer[3] := (C and $FF0000) shr 16; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4_3412Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4_3412_LF : Array[0..3] of Byte = ($00, $00, $0A, $00); UCS4_3412_CR : Array[0..3] of Byte = ($00, $00, $0D, $00); UCS4_3412_CRLF : Array[0..7] of Byte = ($00, $00, $0D, $00, $00, $00, $0A, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4_3412_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4_3412_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4_3412_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS2Codec } { } procedure TUCS2Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PWideChar; Q : PWideChar; I, L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div Sizeof(WideChar); if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; For I := 1 to BufSize do if (Ord(P^) >= $D800) and (Ord(P^) < $E000) then // Do not accept surrogates Case FErrorAction of eaException : RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ord(P^), 'UCS-2'], P - Buf); eaStop : break; eaSkip : Inc(P); eaIgnore : begin Q^ := WideChar(P^); Inc(P); Inc(Q); Inc(L); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); end; end else begin if L >= C then break; Q^ := P^; Inc(P); Inc(Q); Inc(L); end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS2Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PWideChar; Q : PWideChar; I, L, M : Integer; begin Q := S; if not Assigned(Q) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length*2); L := 0; M := 0; P := Pointer(Result); For I := 1 to Length do if (Ord(P^) >= $D800) and (Ord(P^) < $E000) then // Do not accept surrogates Case FErrorAction of eaException : RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ord(P^), 'UCS-2'], L * 2); eaStop : break; eaSkip : begin Inc(Q); Inc(L); end; eaIgnore : begin P^ := Q^; Inc(P); Inc(Q); Inc(L); Inc(M); end; eaReplace : begin P^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); Inc(M); end; end else begin P^ := Q^; Inc(P); Inc(Q); Inc(L); Inc(M); end; if Length <> M then SetLength(Result, M * 2); ProcessedChars := L; end; procedure TUCS2Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; exit; end; C := Swap(C); // UCS4Chars are stored in Little Endian mode; so we need to swap the bytes. ByteCount := 2; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-2']); end; end; procedure TUCS2Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var HighByte, LowByte: Byte; begin if C > $FFFF then raise EConvertError.CreateFmt(SEncodingOutOfRange, ['UCS-2']); HighByte := Hi(C); LowByte := Lo(C); WriteBuffer(HighByte, 1); WriteBuffer(LowByte, 1); ByteCount := 2; end; procedure TUCS2Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS2_LF : Array[0..1] of Byte = ($00, $0A); UCS2_CR : Array[0..1] of Byte = ($00, $0D); UCS2_CRLF : Array[0..3] of Byte = ($00, $0D, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS2_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UCS2_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UCS2_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { ISO-8859-1 - Latin 1 } { } function TISO8859_1Codec.DecodeChar(const P: AnsiChar): WideChar; begin Result := WideChar(P); end; function TISO8859_1Codec.EncodeChar(const Ch: WideChar): AnsiChar; begin if Ord(Ch) >= $100 then raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), 'ISO-8859-1']); Result := AnsiChar(Ch); end; procedure TISO8859_1Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var L, C: Integer; begin L := BufSize; C := DestSize div Sizeof(WideChar); if C < L then L := C; if L < 0 then L := 0; ProcessedBytes := L; DestLength := L; LongToWide(Buf, L, DestBuf); end; { } { ISO-8859-2 Latin 2 } { } const ISO8859_2Map : AnsiCharISOMap = ( #$00A0, #$0104, #$02D8, #$0141, #$00A4, #$013D, #$015A, #$00A7, #$00A8, #$0160, #$015E, #$0164, #$0179, #$00AD, #$017D, #$017B, #$00B0, #$0105, #$02DB, #$0142, #$00B4, #$013E, #$015B, #$02C7, #$00B8, #$0161, #$015F, #$0165, #$017A, #$02DD, #$017E, #$017C, #$0154, #$00C1, #$00C2, #$0102, #$00C4, #$0139, #$0106, #$00C7, #$010C, #$00C9, #$0118, #$00CB, #$011A, #$00CD, #$00CE, #$010E, #$0110, #$0143, #$0147, #$00D3, #$00D4, #$0150, #$00D6, #$00D7, #$0158, #$016E, #$00DA, #$0170, #$00DC, #$00DD, #$0162, #$00DF, #$0155, #$00E1, #$00E2, #$0103, #$00E4, #$013A, #$0107, #$00E7, #$010D, #$00E9, #$0119, #$00EB, #$011B, #$00ED, #$00EE, #$010F, #$0111, #$0144, #$0148, #$00F3, #$00F4, #$0151, #$00F6, #$00F7, #$0159, #$016F, #$00FA, #$0171, #$00FC, #$00FD, #$0163, #$02D9); function TISO8859_2Codec.DecodeChar(const P: AnsiChar): WideChar; begin if Ord(P) < $A0 then Result := WideChar(P) else Result := ISO8859_2Map[P]; end; function TISO8859_2Codec.EncodeChar(const Ch: WideChar): AnsiChar; begin Result := CharFromISOMap(Ch, ISO8859_2Map, 'ISO-8859-2'); end; function GetSystemEncodingCodecClass: TUnicodeCodecClass; begin Case GetACP of 874 : Result := TWindows874Codec; // Thai 932 : Result := nil; // Japan -- Not supported 936 : Result := nil; // Chinese (PRC, Singapore) -- Not supported 949 : Result := nil; // Korean -- Not supported 950 : Result := nil; // Chinese (Taiwan, Hong Kong) -- Not supported 1200 : Result := nil; // Unicode (BMP of ISO 10646) -- Not supported 1250 : Result := TWindows1250Codec; // Windows 3.1 Eastern European 1251 : Result := TWindows1251Codec; // Windows 3.1 Cyrillic 1252 : Result := TWindows1252Codec; // Windows 3.1 Latin 1 (US, Western Europe) 1253 : Result := TWindows1253Codec; // Windows 3.1 Greek 1254 : Result := TWindows1254Codec; // Windows 3.1 Turkish 1255 : Result := TWindows1255Codec; // Hebrew 1256 : Result := TWindows1256Codec; // Arabic 1257 : Result := TWindows1257Codec; // Baltic else Result := nil; end; end; {$ENDIF} { } { Encoding detection } { } function DetectUTFEncoding(const Buf: Pointer; const BufSize: Integer; var BOMSize: Integer): TUnicodeCodecClass; var R : Boolean; begin if DetectUTF16BOM(Buf, BufSize, R) then begin BOMSize := UTF16BOMSize; if R then Result := TUTF16LECodec else Result := TUTF16BECodec end else if DetectUTF8BOM(Buf, BufSize) then begin BOMSize := UTF8BOMSize; Result := TUTF8Codec; end else begin BOMSize := 0; Result := nil; end; end; { } { Unicode conversion functions } { } function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const Buf: Pointer; const BufSize: Integer): WideString; var C : TCustomUnicodeCodec; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try C.DecodeStr(Buf, BufSize, Result); finally C.Free; end; end; function EncodingToUTF16(const CodecClass: TUnicodeCodecClass; const S: String): WideString; var C : TCustomUnicodeCodec; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try C.DecodeStr(PChar(S), Length(S), Result); finally C.Free; end; end; function EncodingToUTF16(const CodecAlias: String; const Buf: Pointer; const BufSize: Integer): WideString; begin Result := EncodingToUTF16(GetCodecClassByAlias(CodecAlias), Buf, BufSize); end; function EncodingToUTF16(const CodecAlias, S: String): WideString; begin Result := EncodingToUTF16(GetCodecClassByAlias(CodecAlias), S); end; function UTF16ToEncoding(const CodecClass: TUnicodeCodecClass; const S: WideString): String; var C : TCustomUnicodeCodec; I : Integer; begin if not Assigned(CodecClass) then begin Result := ''; exit; end; C := CodecClass.Create; try Result := C.Encode(Pointer(S), Length(S), I); finally C.Free; end; end; function UTF16ToEncoding(const CodecAlias: String; const S: WideString): String; begin Result := UTF16ToEncoding(GetCodecClassByAlias(CodecAlias), S); end; { } { EUnicodeCodecException helper functions } { } procedure RaiseUnicodeCodecException(const Msg: String; const ProcessedBytes: Integer); overload; var E : EUnicodeCodecException; begin E := EUnicodeCodecException.Create(Msg); E.ProcessedBytes := ProcessedBytes; raise E; end; procedure RaiseUnicodeCodecException(const Msg: string; const Args: array of const; const ProcessedBytes: Integer); overload; var E : EUnicodeCodecException; begin E := EUnicodeCodecException.CreateFmt(Msg, Args); E.ProcessedBytes := ProcessedBytes; end; { } { TCustomUnicodeCodec } { } constructor TCustomUnicodeCodec.Create; begin inherited Create; FDecodeReplaceChar := WideChar(#$FFFD); FErrorAction := eaException; FReadLFOption := lrPass; FWriteLFOption := lwLF; ResetReadAhead; end; constructor TCustomUnicodeCodec.CreateEx(const AErrorAction: TCodecErrorAction; const ADecodeReplaceChar: WideChar; const AReadLFOption: TCodecReadLFOption; const AWriteLFOption: TCodecWriteLFOption); begin inherited Create; FErrorAction := AErrorAction; FDecodeReplaceChar := ADecodeReplaceChar; FReadLFOption := AReadLFOption; FWriteLFOption := AWriteLFOption; ResetReadAhead; end; procedure TCustomUnicodeCodec.ResetReadAhead; begin FReadAhead := False; FReadAheadBuffer := 0; end; procedure TCustomUnicodeCodec.SetDecodeReplaceChar(const Value: WideChar); begin FDecodeReplaceChar := Value; end; procedure TCustomUnicodeCodec.SetErrorAction(const Value: TCodecErrorAction); begin FErrorAction := Value; end; procedure TCustomUnicodeCodec.SetReadLFOption(const Value: TCodecReadLFOption); begin FReadLFOption := Value; end; procedure TCustomUnicodeCodec.SetWriteLFOption(const Value: TCodecWriteLFOption); begin FWriteLFOption := Value; end; procedure TCustomUnicodeCodec.SetOnRead(const Value: TCodecReadEvent); begin if @Value <> @FOnRead then begin ResetReadAhead; FOnRead := Value; end; end; procedure TCustomUnicodeCodec.DecodeStr(const Buf: Pointer; const BufSize: Integer; var Dest: WideString); var P : PChar; Q : PWideChar; L, M : Integer; I, J : Integer; begin P := Buf; L := BufSize; if not Assigned(P) or (L <= 0) then begin Dest := ''; exit; end; SetLength(Dest, BufSize); M := 0; Repeat Q := Pointer(Dest); Inc(Q, M); Decode(P, L, Q, BufSize * Sizeof(WideChar), I, J); Dec(L, I); Inc(P, I); Inc(M, J); if (J < BufSize) or (L <= 0) then break; SetLength(Dest, M + BufSize); Until False; if Length(Dest) <> M then SetLength(Dest, M); end; function TCustomUnicodeCodec.EncodeStr(const S: WideString): String; var I : Integer; begin Result := Encode(Pointer(S), Length(S), I); end; function TCustomUnicodeCodec.ReadBuffer(var Buf; Count: Integer): Boolean; begin Result := False; if Assigned(FOnRead) then FOnRead(self, Buf, Count, Result); end; procedure TCustomUnicodeCodec.WriteBuffer(const Buf; Count: Integer); begin if Assigned(FOnWrite) then FOnWrite(self, Buf, Count); end; procedure TCustomUnicodeCodec.ReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); begin // Get UCS4 character from read-ahead buffer or from InternalReadUCS4Char if FReadAhead then begin C := FReadAheadBuffer; ByteCount := FReadAheadByteCount; FReadAhead := False; end else InternalReadUCS4Char(C, ByteCount); // Adjust line breaks to Linux-style breaks with a single LINE FEED character if (C = UCS4_CR) and (ReadLFOption = lrNormalize) then begin InternalReadUCS4Char(FReadAheadBuffer, FReadAheadByteCount); if FReadAheadBuffer = UCS4_LF then Inc(ByteCount, FReadAheadByteCount) else FReadAhead := True; C := UCS4_LF; end; end; procedure TCustomUnicodeCodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var ByteCount2 : Integer; begin if C = UCS4_LF then // Transform LINE FEED character Case WriteLFOption of lwLF : InternalWriteUCS4Char(UCS4_LF, ByteCount); lwCR : InternalWriteUCS4Char(UCS4_CR, ByteCount); lwCRLF : begin InternalWriteUCS4Char(UCS4_CR, ByteCount); InternalWriteUCS4Char(UCS4_LF, ByteCount2); Inc(ByteCount, ByteCount2); end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TCustomSingleByteCodec } { } constructor TCustomSingleByteCodec.Create; begin inherited Create; FEncodeReplaceChar := AnsiChar(#32); end; constructor TCustomSingleByteCodec.CreateEx(const ErrorAction: TCodecErrorAction; const DecodeReplaceChar: WideChar; const EncodeReplaceChar: AnsiChar); begin inherited CreateEx(ErrorAction, DecodeReplaceChar); FEncodeReplaceChar := EncodeReplaceChar; end; procedure TCustomSingleByteCodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PChar; Q : PWideChar; I, L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div Sizeof(WideChar); if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; For I := 1 to BufSize do try if L >= C then break; Q^ := DecodeChar(P^); Inc(P); Inc(Q); Inc(L); except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, P - Buf); eaStop : break; eaSkip : Inc(P); eaIgnore : begin Q^ := WideChar(P^); Inc(P); Inc(Q); Inc(L); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); end; end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TCustomSingleByteCodec.DecodeUCS4Char(const P: AnsiChar): UCS4Char; begin Result := Ord(DecodeChar(P)); end; function TCustomSingleByteCodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PChar; Q : PWideChar; I, L, M : Integer; begin Q := S; if not Assigned(Q) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length); L := 0; M := 0; P := Pointer(Result); For I := 1 to Length do try P^ := EncodeChar(Q^); Inc(P); Inc(Q); Inc(L); Inc(M); except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, L); eaStop : break; eaSkip : begin Inc(Q); Inc(L); end; eaIgnore : begin P^ := Char(Q^); Inc(P); Inc(Q); Inc(L); Inc(M); end; eaReplace : begin P^ := FEncodeReplaceChar; Inc(P); Inc(Q); Inc(L); Inc(M); end; end; end; if Length <> M then SetLength(Result, M); ProcessedChars := L; end; function TCustomSingleByteCodec.EncodeUCS4Char(const Ch: UCS4Char): AnsiChar; begin if Ch < $10000 then Result := EncodeChar(WideChar(Ch)) else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, '']); end; procedure TCustomSingleByteCodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : AnsiChar; begin if ReadBuffer(B, 1) then begin C := Ord(DecodeChar(B)); ByteCount := 1; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TCustomSingleByteCodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var E : Char; begin E := EncodeUCS4Char(C); WriteBuffer(E, 1); ByteCount := 1; end; { } { UTF-8 } { } procedure TUTF8Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PChar; Q : PWideChar; L, I : Integer; M, N : Integer; R : TUTF8Error; C : WideChar; begin P := Buf; L := BufSize; Q := DestBuf; N := DestSize div Sizeof(WideChar); if not Assigned(P) or (L <= 0) or not Assigned(Q) or (N <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; M := 0; Repeat if M >= N then break; try R := UTF8ToWideChar(P, L, I, C); Case R of UTF8ErrorNone : begin Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; UTF8ErrorInvalidEncoding : raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); UTF8ErrorIncompleteEncoding : begin ProcessedBytes := BufSize - L; DestLength := M; exit; end; UTF8ErrorOutOfRange : raise EConvertError.CreateFmt(SEncodingOutOfRange, ['UTF-8']); else raise EConvertError.CreateFmt(SUTF8Error, [Ord(R)]); end; except on E : Exception do Case FErrorAction of eaException : RaiseUnicodeCodecException(E.Message, BufSize - L); eaStop : break; eaSkip : begin Inc(P, I); Dec(L, I); end; eaIgnore : begin Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); end; end; end; Until L <= 0; ProcessedBytes := BufSize - L; DestLength := M; end; function TUTF8Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PWideChar; Q : PChar; I, L, M, J : Integer; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 3; SetLength(Result, L); Q := Pointer(Result); M := 0; For I := 1 to Length do begin WideCharToUTF8(P^, Q, L, J); Inc(P); Inc(Q, J); Dec(L, J); Inc(M, J); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUTF8Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); const MaxCode: array[1..6] of LongWord = ($7F, $7FF, $FFFF, $1FFFFF, $3FFFFFF, $7FFFFFFF); var B, First, Mask: Byte; begin if not ReadBuffer(B, 1) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; exit; end; C := B; ByteCount := 1; if C >= $80 then begin // UTF-8 sequence First := B; Mask := $40; if (B and $C0 <> $C0) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); while (Mask and First <> 0) do begin if not ReadBuffer(B, 1) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); if (B and $C0) <> $80 then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); C := (C shl 6) or (B and $3F); // Add bits to C Inc(ByteCount); // Increase sequence length Mask := Mask shr 1; // Adjust Mask end; if ByteCount > 6 then // No 0 bit in sequence header 'First' raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); C := C and MaxCode[ByteCount]; // dispose of header bits // Check for invalid sequence as suggested by RFC2279 if ((ByteCount > 1) and (C <= MaxCode[ByteCount - 1])) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-8']); end; end; procedure TUTF8Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF8(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer, ByteCount); end; procedure TUTF8Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF8_LF : Byte = $0A; UTF8_CR : Byte = $0D; UTF8_CRLF : Array[0..1] of Byte = ($0D, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF8_LF, 1); ByteCount := 1; end; lwCR: begin WriteBuffer(UTF8_CR, 1); ByteCount := 1; end; lwCRLF: begin WriteBuffer(UTF8_CRLF, 2); ByteCount := 2; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { UTF-16BE } { } procedure TUTF16BECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var L, M : Integer; P, Q : PWideChar; begin L := BufSize; if L > DestSize then L := DestSize; if L <= 1 then begin ProcessedBytes := 0; DestLength := 0; exit; end; Dec(L, L mod Sizeof(WideChar)); M := L div Sizeof(WideChar); P := Buf; Q := DestBuf; Move(P^, Q^, L); DestLength := M; ProcessedBytes := L; end; function TUTF16BECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var L : Integer; begin if Length <= 0 then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 2; SetLength(Result, L); Move(S^, Pointer(Result)^, L); ProcessedChars := Length; end; procedure TUTF16BECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var LowSurrogate: Array[0..1] of Byte; // We do not use Word, because the byte order of a Word is CPU dependant begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; Exit; end; C := Swap(C); // UCS4Chars are stored in Little Endian mode; so we need to swap the bytes. Case C of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if not ReadBuffer(LowSurrogate, 2) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); Case LowSurrogate[0] of $DC..$DF: begin C := ((C - $D7C0) shl 10) + ((LowSurrogate[0] xor $DC) shl 8) + LowSurrogate[1]; ByteCount := 4; end; else raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16BE']); else ByteCount := 2; end; end; procedure TUTF16BECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF16BE(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer[0], ByteCount); end; procedure TUTF16BECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF16BE_LF : Array[0..1] of Byte = ($00, $0A); UTF16BE_CR : Array[0..1] of Byte = ($00, $0D); UTF16BE_CRLF : Array[0..3] of Byte = ($00, $0D, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF16BE_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UTF16BE_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UTF16BE_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { UTF-16LE } { } procedure TUTF16LECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var I, L, M : Integer; P, Q : PWideChar; begin L := BufSize; if L > DestSize then L := DestSize; if L <= 1 then begin ProcessedBytes := 0; DestLength := 0; exit; end; Dec(L, L mod Sizeof(WideChar)); M := L div Sizeof(WideChar); P := Buf; Q := DestBuf; For I := 1 to M do begin Q^ := SwapUTF16Endian(P^); Inc(P); Inc(Q); end; DestLength := M; ProcessedBytes := L; end; function TUTF16LECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var I, L : Integer; P, Q : PWideChar; begin if Length <= 0 then begin ProcessedChars := 0; Result := ''; exit; end; L := Length * 2; SetLength(Result, L); P := S; Q := Pointer(Result); For I := 1 to Length do begin Q^ := SwapUTF16Endian(P^); Inc(P); Inc(Q); end; ProcessedChars := Length; end; procedure TUTF16LECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var LowSurrogate : Array[0..1] of Byte; // We do not use Word, because the byte order of a Word is CPU dependant begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; Exit; end; Case C of // UCS4Chars are stored in Little Endian mode; so we just can go on with it. $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if not ReadBuffer(LowSurrogate, 2) then raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); Case LowSurrogate[1] of $DC..$DF: begin C := ((C - $D7C0) shl 10) + ((LowSurrogate[1] xor $DC) shl 8) + LowSurrogate[0]; ByteCount := 4; end; else raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.CreateFmt(SInvalidEncoding, ['UTF-16LE']); else ByteCount := 2; end; end; procedure TUTF16LECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin UCS4CharToUTF16LE(C, @Buffer, 4, ByteCount); WriteBuffer(Buffer[0], ByteCount); end; procedure TUTF16LECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UTF16LE_LF : Array[0..1] of Byte = ($0A, $00); UTF16LE_CR : Array[0..1] of Byte = ($0D, $00); UTF16LE_CRLF : Array[0..3] of Byte = ($0D, $00, $0A, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UTF16LE_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UTF16LE_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UTF16LE_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4BECodec } { } procedure TUCS4BECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord(P^) * $1000000 + Ord((P + 1)^) * $10000 + Ord((P + 2)^) * $100 + Ord((P + 3)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := (P + 1)^; // Nevertheless change Big Endian to Little Endian ... (Q + 1)^ := P^; (Q + 2)^ := (P + 3)^; (Q + 3)^ := (P + 2)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else begin if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 1)^) shl 6) + (Ord((P + 2)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 1)^) shr 2)); (Q + 2)^ := (P + 3)^; (Q + 3)^ := Char($DC + (3 and Ord((P + 2)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 3)^; (Q + 1)^ := (P + 2)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4BECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(0); (Q+1)^ := Char((HighSurrogate - $D7C0) shr 6); (Q+2)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q+3)^ := Char(Lo(LowSurrogate)); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q+1)^ := Char(0); (Q+2)^ := Char(Hi(Ord(P^))); (Q+3)^ := Char(Lo(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4BECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[0] * $1000000 + B[1] * $10000 + B[2] * $100 + B[3]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4BE']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4BECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := C shr 24; Buffer[1] := (C and $FF0000) shr 16; Buffer[2] := (C and $FF00) shr 8; Buffer[3] := C and $FF; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4BECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4BE_LF : Array[0..3] of Byte = ($0A, $00, $00, $00); UCS4BE_CR : Array[0..3] of Byte = ($0D, $00, $00, $00); UCS4BE_CRLF : Array[0..7] of Byte = ($0D, $00, $00, $00, $0A, $00, $00, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4BE_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4BE_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4BE_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4LECodec } { } procedure TUCS4LECodec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 3)^) * $1000000 + Ord((P + 2)^) * $10000 + Ord((P + 1)^) * $100 + Ord(P^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; (Q + 2)^ := (P + 2)^; (Q + 3)^ := (P + 3)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 2)^) shl 6) + (Ord((P + 1)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 2)^) shr 2)); (Q + 2)^ := P^; (Q + 3)^ := Char($DC + (3 and Ord((P + 1)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4LECodec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(Lo(LowSurrogate)); (Q + 1)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q + 2)^ := Char((HighSurrogate - $D7C0) shr 6); (Q + 3)^ := Char(0); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Hi(Ord(P^))); (Q + 3)^ := Char(Lo(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4LECodec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[3] * $1000000 + B[2] * $10000 + B[1] * $100 + B[0]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4LE']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4LECodec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := C and $FF; Buffer[1] := (C and $FF00) shr 8; Buffer[2] := (C and $FF0000) shr 16; Buffer[3] := C shr 24; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4LECodec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4LE_LF : Array[0..3] of Byte = ($00, $00, $00, $0A); UCS4LE_CR : Array[0..3] of Byte = ($00, $00, $00, $0D); UCS4LE_CRLF : Array[0..7] of Byte = ($00, $00, $00, $0D, $00, $00, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4LE_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4LE_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4LE_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4_2143Codec } { } procedure TUCS4_2143Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 1)^) * $1000000 + Ord(P^) * $10000 + Ord((P + 3)^) * $100 + Ord((P + 2)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := P^; (Q + 1)^ := (P + 1)^; (Q + 2)^ := (P + 2)^; (Q + 3)^ := (P + 3)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord(P^) shl 6) + (Ord((P + 3)^) shr 2)); (Q + 1)^ := Char($D8 + (Ord(P^) shr 2)); (Q + 2)^ := (P + 2)^; (Q + 3)^ := Char($DC + (3 and Ord((P + 3)^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 2)^; (Q + 1)^ := (P + 3)^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4_2143Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do begin Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char((HighSurrogate - $D7C0) shr 6); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Lo(LowSurrogate)); (Q + 3)^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(0); (Q + 1)^ := Char(0); (Q + 2)^ := Char(Lo(Ord(P^))); (Q + 3)^ := Char(Hi(Ord(P^))); Inc(P); Inc(Q, 4); Inc(M, 4); end; end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4_2143Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[1] * $1000000 + B[0] * $10000 + B[3] * $100 + B[2]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4']); end; end else C := UCS4_STRING_TERMINATOR; ByteCount := 4; end; procedure TUCS4_2143Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := (C and $FF0000) shr 16; Buffer[1] := C shr 24; Buffer[2] := C and $FF; Buffer[3] := (C and $FF00) shr 8; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4_2143Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4_2143_LF : Array[0..3] of Byte = ($00, $0A, $00, $00); UCS4_2143_CR : Array[0..3] of Byte = ($00, $0D, $00, $00); UCS4_2143_CRLF : Array[0..7] of Byte = ($00, $0D, $00, $00, $00, $0A, $00, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4_2143_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4_2143_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4_2143_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS4_3412Codec } { } procedure TUCS4_3412Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var Ch4 : UCS4Char; N, P, Q : PChar; L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div 2; if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; N := P + BufSize - 4; While P <= N do begin Ch4 := Ord((P + 2)^) * $1000000 + Ord((P + 3)^) * $10000 + Ord(P^) * $100 + Ord((P + 1)^); if ((Ch4 >= $D800) and (Ch4 < $E000)) or (Ch4 > $10FFFF) then Case FErrorAction of eaException : if Ch4 > $10FFFF then RaiseUnicodeCodecException(SCannotConvertUCS4, [Ch4, 'UTF-16'], P - Buf) else RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ch4, 'UCS-4'], P - Buf); eaStop : break; eaSkip : Inc(P, 4); eaIgnore : begin if L + 1 >= C then break; Q^ := (P + 1)^; // Nevertheless change Big Endian to Little Endian ... (Q + 1)^ := P^; (Q + 2)^ := (P + 3)^; (Q + 3)^ := (P + 2)^; Inc(Q, 4); Inc(P, 4); Inc(L, 2); end; eaReplace : begin if L >= C then break; Q^ := Char(Lo(Ord(FDecodeReplaceChar))); (Q + 1)^ := Char(Hi(Ord(FDecodeReplaceChar))); Inc(Q, 2); Inc(P, 4); Inc(L); end; end else if Ch4 > $FFFF then begin if L + 1 >= C then break; Q^ := Char((Ord((P + 3)^) shl 6) + (Ord(P^) shr 2)); (Q + 1)^ := Char($D8 + (Ord((P + 3)^) shr 2)); (Q + 2)^ := (P + 1)^; (Q + 3)^ := Char($DC + (3 and Ord(P^))); Inc(Q, 4); Inc(P, 4); Inc(L, 2); end else begin if L >= C then break; Q^ := (P + 1)^; (Q + 1)^ := P^; Inc(Q, 2); Inc(P, 4); Inc(L); end; end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS4_3412Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P, N : PWideChar; Q : PChar; M : Integer; HighSurrogate : Word; LowSurrogate : Word; begin P := S; if not Assigned(P) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length * 4); Q := Pointer(Result); M := 0; N := P + Length; While P < N do Case Ord(P^) of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if P = N - 1 then // End of WideString? raise EConvertError.Create(SLowSurrogateNotFound); HighSurrogate := Ord(P^); Inc(P); Inc(M, 2); LowSurrogate := Ord(P^); Case LowSurrogate of // Low Surrogate following? $DC00..$DF00: begin Q^ := Char(((HighSurrogate and $3F) shl 2) + ((LowSurrogate - $DC00) shr 8)); (Q + 1)^ := Char(Lo(LowSurrogate)); (Q + 2)^ := Char(0); (Q + 3)^ := Char((HighSurrogate - $D7C0) shr 6); Inc(P); Inc(Q, 4); Inc(M, 4); end; else raise EConvertError.Create(SLowSurrogateNotFound); end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] raise EConvertError.Create(SHighSurrogateNotFound); else Q^ := Char(Hi(Ord(P^))); (Q + 1)^ := Char(Lo(Ord(P^))); (Q + 2)^ := Char(0); (Q + 3)^ := Char(0); Inc(P); Inc(Q, 4); Inc(M, 4); end; SetLength(Result, M); ProcessedChars := Length; end; procedure TUCS4_3412Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); var B : Array[0..3] of Byte; begin if ReadBuffer(B, 4) then begin C := B[2] * $1000000 + B[3] * $10000 + B[0] * $100 + B[1]; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-4']); end; ByteCount := 4; end else begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; end; end; procedure TUCS4_3412Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var Buffer : Array[0..3] of Byte; begin Buffer[0] := (C and $FF00) shr 8; Buffer[1] := C and $FF; Buffer[2] := C shr 24; Buffer[3] := (C and $FF0000) shr 16; WriteBuffer(Buffer, 4); ByteCount := 4; end; procedure TUCS4_3412Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS4_3412_LF : Array[0..3] of Byte = ($00, $00, $0A, $00); UCS4_3412_CR : Array[0..3] of Byte = ($00, $00, $0D, $00); UCS4_3412_CRLF : Array[0..7] of Byte = ($00, $00, $0D, $00, $00, $00, $0A, $00); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS4_3412_LF, 4); ByteCount := 4; end; lwCR: begin WriteBuffer(UCS4_3412_CR, 4); ByteCount := 4; end; lwCRLF: begin WriteBuffer(UCS4_3412_CRLF, 8); ByteCount := 8; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { TUCS2Codec } { } procedure TUCS2Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var P : PWideChar; Q : PWideChar; I, L, C : Integer; begin P := Buf; Q := DestBuf; C := DestSize div Sizeof(WideChar); if not Assigned(P) or (BufSize <= 0) or not Assigned(Q) or (C <= 0) then begin ProcessedBytes := 0; DestLength := 0; exit; end; L := 0; For I := 1 to BufSize do if (Ord(P^) >= $D800) and (Ord(P^) < $E000) then // Do not accept surrogates Case FErrorAction of eaException : RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ord(P^), 'UCS-2'], P - Buf); eaStop : break; eaSkip : Inc(P); eaIgnore : begin Q^ := WideChar(P^); Inc(P); Inc(Q); Inc(L); end; eaReplace : begin Q^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); end; end else begin if L >= C then break; Q^ := P^; Inc(P); Inc(Q); Inc(L); end; DestLength := L; ProcessedBytes := P - Buf; end; function TUCS2Codec.Encode(const S: PWideChar; const Length: Integer; out ProcessedChars: Integer): String; var P : PWideChar; Q : PWideChar; I, L, M : Integer; begin Q := S; if not Assigned(Q) or (Length <= 0) then begin ProcessedChars := 0; Result := ''; exit; end; SetLength(Result, Length*2); L := 0; M := 0; P := Pointer(Result); For I := 1 to Length do if (Ord(P^) >= $D800) and (Ord(P^) < $E000) then // Do not accept surrogates Case FErrorAction of eaException : RaiseUnicodeCodecException(SSurrogateNotAllowed, [Ord(P^), 'UCS-2'], L * 2); eaStop : break; eaSkip : begin Inc(Q); Inc(L); end; eaIgnore : begin P^ := Q^; Inc(P); Inc(Q); Inc(L); Inc(M); end; eaReplace : begin P^ := FDecodeReplaceChar; Inc(P); Inc(Q); Inc(L); Inc(M); end; end else begin P^ := Q^; Inc(P); Inc(Q); Inc(L); Inc(M); end; if Length <> M then SetLength(Result, M * 2); ProcessedChars := L; end; procedure TUCS2Codec.InternalReadUCS4Char(out C: UCS4Char; out ByteCount: Integer); begin C := 0; // C must be initialized, because the ReadBuffer(C, 2) call below does // not fill the whole variable! if not ReadBuffer(C, 2) then begin C := UCS4_STRING_TERMINATOR; ByteCount := 0; exit; end; C := Swap(C); // UCS4Chars are stored in Little Endian mode; so we need to swap the bytes. ByteCount := 2; Case C of $D800..$DFFF: // Do not accept surrogates raise EConvertError.CreateFmt(SSurrogateNotAllowed, [C, 'UCS-2']); end; end; procedure TUCS2Codec.InternalWriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); var HighByte, LowByte: Byte; begin if C > $FFFF then raise EConvertError.CreateFmt(SEncodingOutOfRange, ['UCS-2']); HighByte := Hi(C); LowByte := Lo(C); WriteBuffer(HighByte, 1); WriteBuffer(LowByte, 1); ByteCount := 2; end; procedure TUCS2Codec.WriteUCS4Char(const C: UCS4Char; out ByteCount: Integer); const UCS2_LF : Array[0..1] of Byte = ($00, $0A); UCS2_CR : Array[0..1] of Byte = ($00, $0D); UCS2_CRLF : Array[0..3] of Byte = ($00, $0D, $00, $0A); begin if C = UCS4_LF then Case WriteLFOption of lwLF: begin WriteBuffer(UCS2_LF, 2); ByteCount := 2; end; lwCR: begin WriteBuffer(UCS2_CR, 2); ByteCount := 2; end; lwCRLF: begin WriteBuffer(UCS2_CRLF, 4); ByteCount := 4; end; end else InternalWriteUCS4Char(C, ByteCount); end; { } { ISO-8859-1 - Latin 1 } { } function TISO8859_1Codec.DecodeChar(const P: AnsiChar): WideChar; begin Result := WideChar(P); end; function TISO8859_1Codec.EncodeChar(const Ch: WideChar): AnsiChar; begin if Ord(Ch) >= $100 then raise EConvertError.CreateFmt(SCannotConvert, [Ord(Ch), 'ISO-8859-1']); Result := AnsiChar(Ch); end; procedure TISO8859_1Codec.Decode(const Buf: Pointer; const BufSize: Integer; const DestBuf: Pointer; const DestSize: Integer; out ProcessedBytes, DestLength: Integer); var L, C: Integer; begin L := BufSize; C := DestSize div Sizeof(WideChar); if C < L then L := C; if L < 0 then L := 0; ProcessedBytes := L; DestLength := L; LongToWide(Buf, L, DestBuf); end; { } { ISO-8859-2 Latin 2 } { } const ISO8859_2Map : AnsiCharISOMap = ( #$00A0, #$0104, #$02D8, #$0141, #$00A4, #$013D, #$015A, #$00A7, #$00A8, #$0160, #$015E, #$0164, #$0179, #$00AD, #$017D, #$017B, #$00B0, #$0105, #$02DB, #$0142, #$00B4, #$013E, #$015B, #$02C7, #$00B8, #$0161, #$015F, #$0165, #$017A, #$02DD, #$017E, #$017C, #$0154, #$00C1, #$00C2, #$0102, #$00C4, #$0139, #$0106, #$00C7, #$010C, #$00C9, #$0118, #$00CB, #$011A, #$00CD, #$00CE, #$010E, #$0110, #$0143, #$0147, #$00D3, #$00D4, #$0150, #$00D6, #$00D7, #$0158, #$016E, #$00DA, #$0170, #$00DC, #$00DD, #$0162, #$00DF, #$0155, #$00E1, #$00E2, #$0103, #$00E4, #$013A, #$0107, #$00E7, #$010D, #$00E9, #$0119, #$00EB, #$011B, #$00ED, #$00EE, #$010F, #$0111, #$0144, #$0148, #$00F3, #$00F4, #$0151, #$00F6, #$00F7, #$0159, #$016F, #$00FA, #$0171, #$00FC, #$00FD, #$0163, #$02D9); function TISO8859_2Codec.DecodeChar(const P: AnsiChar): WideChar; begin if Ord(P) < $A0 then Result := WideChar(P) else Result := ISO8859_2Map[P]; end; function TISO8859_2Codec.EncodeChar(const Ch: WideChar): AnsiChar; begin Result := CharFromISOMap(Ch, ISO8859_2Map, 'ISO-8859-2'); end;