Spinning Topp Logo BlackTopp Studios
inc
unicode.h
Go to the documentation of this file.
1 // The UTF8 Conversion Library is a small library aiding in the conversion from raw text to UTF8 text.
2 // © Copyright 2010 - 2016 BlackTopp Studios Inc.
3 /* This file is part of The UTF8 Conversion Library.
4 
5  The UTF8 Conversion Library is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  The UTF8 Conversion Library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with The UTF8 Conversion Library. If not, see <http://www.gnu.org/licenses/>.
17 */
18 /* The original authors have included a copy of the license specified above in the
19  'doc' folder. See 'gpl.txt'
20 */
21 /* We welcome the use of the UTF8 Conversion Library to anyone, including companies who wish to
22  Build professional software and charge for their product.
23 
24  However there are some practical restrictions, so if your project involves
25  any of the following you should contact us and we will try to work something
26  out:
27  - DRM or Copy Protection of any kind(except Copyrights)
28  - Software Patents You Do Not Wish to Freely License
29  - Any Kind of Linking to Non-GPL licensed Works
30  - Are Currently In Violation of Another Copyright Holder's GPL License
31  - If You want to change our code and not add a few hundred MB of stuff to
32  your distribution
33 
34  These and other limitations could cause serious legal problems if you ignore
35  them, so it is best to simply contact us or the Free Software Foundation, if
36  you have any questions.
37 
38  Joseph Toppi - toppij@gmail.com
39  John Blackwood - makoenergy02@gmail.com
40 */
41 
42 #ifndef _unicode_h
43 #define _unicode_h
44 
45 #include "datatypes.h"
46 #include <iostream>
47 
48 /// @file
49 /// @brief This contains simple tools for indexing with UTF8 characters swiftly
50 
51 namespace Mezzanine
52 {
53  /// @brief A dumping ground for any feature that only relates to unicode work.
54  /// @details Unicode is a series of numbers that correlate to glyphs. These numbers
55  /// are separate from any binary representation. Common binary representations of the
56  /// numbers are UTF8, UTF16, and UTF32. This library supports UTF8, which uses between
57  /// 1 and 4 bytes to represent any valid Unicode glyph. The tools provided here allow
58  /// conversion between the raw Unicode value, which is useful for algorithms, and
59  /// its UTF8 representation, which is useful for storage and transmission.
60  namespace Unicode
61  {
62  const UInt8 High1Bit = 0x80; ///< 1xxxxxxx - Compared against the high 2 bits of a byte to determine whether it falls in the middle of a sequence
63  const UInt8 High2Bit = 0xC0; ///< 11xxxxxx - High1Bit plus bit 6
64  const UInt8 High3Bit = 0xE0; ///< 111xxxxx - High2Bit plus bit 5
65  const UInt8 High4Bit = 0xF0; ///< 1111xxxx - High3Bit plus bit 4
66  const UInt8 High5Bit = 0xF8; ///< 11111xxx - High4Bit plus bit 3
67  const UInt8 High6Bit = 0xFC; ///< 111111xx - High5Bit plus bit 2
68  const UInt8 High7Bit = 0xFE; ///< 1111111x - High6Bit plus bit 1
69  const UInt8 High8Bit = 0xFF; ///< 11111111 - Every bit set
70 
71  /// @brief The index of this array corresponds to the amount of high bits that are set.
73 
74 
75  const UInt8 Low1Bit = 0x01; ///< xxxxxxx1 - Only the lowest bit set
76  const UInt8 Low2Bit = 0x03; ///< xxxxxx11 - Low1Bit plus bit 1
77  const UInt8 Low3Bit = 0x07; ///< xxxxx111 - Low2Bit plus bit 2
78  const UInt8 Low4Bit = 0x0F; ///< xxxx1111 - Low3Bit plus bit 3
79  const UInt8 Low5Bit = 0x1F; ///< xxx11111 - Low4Bit plus bit 4
80  const UInt8 Low6Bit = 0x3F; ///< xx111111 - Low5Bit plus bit 5
81  const UInt8 Low7Bit = 0x7F; ///< x1111111 - Low6Bit plus bit 6
82  const UInt8 Low8Bit = 0xFF; ///< 11111111 - Every bit set
83 
84  /// @brief The index of this array corresponds to the amount of low bits that are set.
86 
87  const Int32 UTF8ByteRange1Max = 127; ///< The maximum Unicode codepoint that can fit into a single UTF8 byte (0xxxxxxx, 7 payload bits). Equal to 2^7-1.
88  const Int32 UTF8ByteRange2Max = 2047; ///< The maximum Unicode codepoint that can fit into 2 UTF8 bytes (110xxxxx 10xxxxxx, 11 payload bits). Equal to 2^11-1.
89  const Int32 UTF8ByteRange3Max = 65535; ///< The maximum Unicode codepoint that can fit into 3 UTF8 bytes (1110xxxx 10xxxxxx 10xxxxxx, 16 payload bits). Equal to 2^16-1.
90  const Int32 UTF8ByteRange4Max = 2097151; ///< The maximum Unicode codepoint that can fit into 4 UTF8 bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, 21 payload bits). Equal to 2^21-1.
91 
92  const UInt32 High1bytes = 0xFFu << 24; ///< Mask selecting the highest byte of a 32 bit integer (0xFF000000)
93  const UInt32 High2bytes = 0xFFFFu << 16; ///< Mask selecting the highest 2 bytes of a 32 bit integer (0xFFFF0000)
94  const UInt32 High3bytes = 0xFFFFFFu << 8; ///< Mask selecting the highest 3 bytes of a 32 bit integer (0xFFFFFF00)
95 
96  const UInt32 UTF8Null2ByteBase = 0xC080; ///< The numerical representation of 0 in a two byte UTF8 sequence. Is equal to 11000000 10000000
97  const UInt32 UTF8Null3ByteBase = 0xE08080; ///< The numerical representation of 0 in a three byte UTF8 sequence. Is equal to 11100000 10000000 10000000
98  const UInt32 UTF8Null4ByteBase = 0xF0808080; ///< The numerical representation of 0 in a four byte UTF8 sequence. Is equal to 11110000 10000000 10000000 10000000
99 
100 
101  /// @brief A helper function that renders an integer as a human readable sequence of ' ', '1' and '0' characters.
102  /// @param IntToPrint The 32 bit integer that will be used to create the sequence.
103  /// @return A Mezzanine::String containing '1' and '0' characters with a space every eight digits.
104  String AsBitString(Int32 IntToPrint);
105 
106  /// @brief Get a number suitable for use as an index from a character string.
107  /// @param BytesUsed Output only: the incoming value of this variable is ignored and overwritten with the amount of bytes consumed from CurrentCharacter.
108  /// @param CurrentCharacter A pointer to a c style string.
109  /// @return If the character pointed to is the beginning of a valid UTF8 character, a number suitable for use as an index is returned; otherwise some negative value is returned.
110  Int32 GetIntFromCharacter(Int32& BytesUsed, const char* CurrentCharacter);
111 
112  /// @brief Convert a number that represents any valid unicode value into its UTF8 representation.
113  /// @param Destination The place to write the results. Never more than 4 bytes will be written. Null terminators are not written.
114  /// @param BytesUsable How many bytes of the Destination are usable.
115  /// @param ByteSequence The integer value to convert to a UTF8 unicode representation. This value must be representable in 21 or fewer bits (<2097152) to be valid.
116  /// @return The amount of bytes written to Destination, or -1 on error. This will never be more than 4.
117  Int32 GetCharacterFromInt(char* Destination, Int32 BytesUsable, Int32 ByteSequence);
118  }//Unicode
119 }//Mezzanine
120 
121 #endif
int32_t Int32
A 32-bit integer.
Definition: datatypes.h:124
const UInt8 High1Bit
1xxxxxxx - Is used compared against high 2 bits to determine if in middle of byte ...
Definition: unicode.h:62
const Int32 UTF8ByteRange3Max
The maximum Unicode codepoint that can fit into 3 UTF8 bytes. Equal to 2^16-1.
Definition: unicode.h:89
const UInt8 Low2Bit
xxxxxx11
Definition: unicode.h:76
const UInt8 Low4Bit
xxxx1111
Definition: unicode.h:78
const UInt32 UTF8Null3ByteBase
This is the numerical representation 0 in a three UTF8 Sequence. Is equal to 11100000 10000000 100000...
Definition: unicode.h:97
const UInt8 Low6Bit
xx111111
Definition: unicode.h:80
All the definitions for datatypes as well as some basic conversion functions are defined here...
const UInt8 High7Bit
1111111x
Definition: unicode.h:68
const UInt8 High3Bit
111xxxxx
Definition: unicode.h:64
String AsBitString(Int32 IntToPrint)
A helper function that produces a human readable sequence of ' ', '1' and '0' characters.
Definition: unicode.cpp:71
const UInt8 Low3Bit
xxxxx111
Definition: unicode.h:77
const UInt8 High2Bit
11xxxxxx
Definition: unicode.h:63
uint8_t UInt8
An 8-bit unsigned integer.
Definition: datatypes.h:118
const UInt32 High3bytes
The Highest 3 bytes of an integer on this system.
Definition: unicode.h:94
const UInt8 Low8Bit
11111111
Definition: unicode.h:82
const UInt8 IterableLowBits[]
The index of this array corresponds to the amount of low bits that are set.
Definition: unicode.h:85
const UInt8 Low5Bit
xxx11111
Definition: unicode.h:79
const UInt8 High8Bit
11111111
Definition: unicode.h:69
const Int32 UTF8ByteRange2Max
The maximum Unicode codepoint that can fit into 2 UTF8 bytes. Equal to 2^11-1.
Definition: unicode.h:88
uint32_t UInt32
A 32-bit unsigned integer.
Definition: datatypes.h:126
const UInt8 Low7Bit
x1111111
Definition: unicode.h:81
const UInt8 High6Bit
111111xx
Definition: unicode.h:67
const UInt8 IterableHighBits[]
The index of this array corresponds to the amount of high bits that are set.
Definition: unicode.h:72
const UInt8 Low1Bit
xxxxxxx1
Definition: unicode.h:75
const UInt32 UTF8Null4ByteBase
This is the numerical representation 0 in a four UTF8 Sequence. Is equal to 11110000 10000000 1000000...
Definition: unicode.h:98
const UInt32 High1bytes
The Highest byte of an integer on this system.
Definition: unicode.h:92
Int32 GetIntFromCharacter(Int32 &BytesUsed, const char *CurrentCharacter)
Get a number suitable for using in an index from a character string.
Definition: unicode.cpp:86
const UInt8 High5Bit
11111xxx
Definition: unicode.h:66
The bulk of the engine components go in this namespace.
Definition: actor.cpp:56
const UInt32 UTF8Null2ByteBase
This is the numerical representation 0 in a two UTF8 Sequence. Is equal to 11000000 10000000...
Definition: unicode.h:96
const Int32 UTF8ByteRange4Max
The maximum Unicode codepoint that can fit into 4 UTF8 bytes. Equal to 2^21-1.
Definition: unicode.h:90
const UInt32 High2bytes
The Highest 2 bytes of an integer on this system.
Definition: unicode.h:93
const UInt8 High4Bit
1111xxxx
Definition: unicode.h:65
const Int32 UTF8ByteRange1Max
The maximum Unicode codepoint that can fit into a single UTF8 byte. Equal to 2^7-1.
Definition: unicode.h:87
std::string String
A datatype used to hold a series of characters.
Definition: datatypes.h:159
Int32 GetCharacterFromInt(char *Destination, Int32 BytesUsable, Int32 ByteSequence)
Convert a number that represents any valid unicode value into its UTF8 representation.
Definition: unicode.cpp:116