Spinning Topp Logo BlackTopp Studios
inc
unicode.cpp
Go to the documentation of this file.
1 // The UTF8 Conversion Library is a small library aiding in the conversion from raw text to UTF8 text.
2 // © Copyright 2010 - 2016 BlackTopp Studios Inc.
3 /* This file is part of The UTF8 Conversion Library.
4 
5  The UTF8 Conversion Library is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  The UTF8 Conversion Library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with The UTF8 Conversion Library. If not, see <http://www.gnu.org/licenses/>.
17 */
18 /* The original authors have included a copy of the license specified above in the
19  'doc' folder. See 'gpl.txt'
20 */
21 /* We welcome the use of the UTF8 Conversion Library to anyone, including companies who wish to
22  Build professional software and charge for their product.
23 
24  However there are some practical restrictions, so if your project involves
25  any of the following you should contact us and we will try to work something
26  out:
27  - DRM or Copy Protection of any kind(except Copyrights)
28  - Software Patents You Do Not Wish to Freely License
29  - Any Kind of Linking to Non-GPL licensed Works
30  - Are Currently In Violation of Another Copyright Holder's GPL License
31  - If You want to change our code and not add a few hundred MB of stuff to
32  your distribution
33 
34  These and other limitations could cause serious legal problems if you ignore
35  them, so it is best to simply contact us or the Free Software Foundation, if
36  you have any questions.
37 
38  Joseph Toppi - toppij@gmail.com
39  John Blackwood - makoenergy02@gmail.com
40 */
41 
42 #ifndef _unicode_cpp
43 #define _unicode_cpp
44 
45 #include "unicode.h"
46 
47 using namespace std;
48 
49 /// @file
50 /// @brief This contains simple tools for indexing with UTF8 characters swiftly
51 
52 namespace Mezzanine
53 {
54  namespace Unicode
55  {
56  namespace
57  {
58  /// @internal
59  /// Checks following bytes for UTF8 validity.
60  Boole CheckAsFollowingBytes(Int32 HowMany, const char* FirstChar)
61  {
62  for(Int32 Count = 1; HowMany>Count; Count++)
63  {
64  if ( (*(FirstChar+Count) & High2Bit) != High1Bit)
65  { return false; }
66  }
67  return true;
68  }
69  }
70 
71  String AsBitString(Int32 IntToPrint)
72  {
73  string Results;
74  for(int Counter=31; Counter>=0; Counter--)
75  {
76  if ( ((Counter+1)%8)==0 && Counter!=31)
77  { Results += " "; }
78  if ( (1<<Counter) & IntToPrint )
79  { Results += "1"; }
80  else
81  { Results += "0"; }
82  }
83  return Results;
84  }
85 
86  Int32 GetIntFromCharacter(Int32& BytesUsed, const char* CurrentCharacter)
87  {
88 
89  if( (*CurrentCharacter & High1Bit) == 0)
90  {
91  BytesUsed=1;
92  char Results = *CurrentCharacter;
93  return Results;
94  }
95 
96  for(Int32 Counter=2; Counter<7; Counter++)
97  {
98  if((*CurrentCharacter & IterableHighBits[Counter+1]) == IterableHighBits[Counter])
99  {
100  if( CheckAsFollowingBytes(Counter-1, CurrentCharacter) )
101  {
102  BytesUsed=Counter;
103  Int32 Results = (CurrentCharacter[0] & IterableLowBits[8-Counter]) << (6*(Counter-1));
104  for(Int32 Remains = 1; Remains<Counter; Remains++)
105  { Results |= (CurrentCharacter[Remains] & Low6Bit) << (6*(Counter-Remains-1)); }
106  return Results;
107  }
108  return -1;
109  }
110  }
111 
112  return -1;
113  }
114 
115  ///
116  Int32 GetCharacterFromInt(char* Destination, Int32 BytesUsable, Int32 ByteSequence)
117  {
118  ///
119  if(ByteSequence<UTF8ByteRange1Max)
120  {
121  //1 byte or invalid
122  if(0>ByteSequence)
123  { return -1; }
124  else
125  {
126  // 1 byte
127  if (1>BytesUsable)
128  { return -1; }
129  char Ascii = (char)ByteSequence;
130  Destination[0]=Ascii;
131  return 1;
132  }
133  }
134  else
135  {
136  //more than one
137  if(ByteSequence<UTF8ByteRange2Max)
138  {
139  // 2 bytes
140  if (2>BytesUsable)
141  { return -1; }
142  Int32 Results = UTF8Null2ByteBase;
143  Results |= (ByteSequence & Int32(Low6Bit));
144  Results |= (ByteSequence & (Int32(Low5Bit)<<6) ) << 2;
145  char* Bytes = (char*)&Results;
146  Destination[0]=Bytes[1];
147  Destination[1]=Bytes[0];
148  return 2;
149  }
150  else
151  {
152  if(ByteSequence<UTF8ByteRange3Max)
153  {
154  // 3 or more
155  if (3>BytesUsable)
156  { return -1; }
157  Int32 Results = UTF8Null3ByteBase;
158  Results |= (ByteSequence & Int32(Low6Bit));
159  Results |= (ByteSequence & (Int32(Low6Bit)<<6) ) << 2;
160  Results |= (ByteSequence & (Int32(Low4Bit)<<12) ) << 4;
161  char* Bytes = (char*)&Results;
162  Destination[0]=Bytes[2];
163  Destination[1]=Bytes[1];
164  Destination[2]=Bytes[0];
165  return 3;
166  }
167  else
168  {
169  if(ByteSequence<UTF8ByteRange4Max)
170  {
171  // 4 or more
172  if (4>BytesUsable)
173  { return -1; }
174  Int32 Results = UTF8Null4ByteBase;
175  Results |= (ByteSequence & Int32(Low6Bit));
176  Results |= (ByteSequence & (Int32(Low6Bit)<<6) ) << 2;
177  Results |= (ByteSequence & (Int32(Low6Bit)<<12) ) << 4;
178  Results |= (ByteSequence & (Int32(Low3Bit)<<18) ) << 6;
179  char* Bytes = (char*)&Results;
180  Destination[0]=Bytes[3];
181  Destination[1]=Bytes[2];
182  Destination[2]=Bytes[1];
183  Destination[3]=Bytes[0];
184  return 4;
185  }
186  else
187  { return -1; }
188 
189  }
190  }
191  }
192  return -1;
193  }
194  }//Unicode
195 }//Mezzanine
196 
197 #endif
int32_t Int32
A 32-bit integer.
Definition: datatypes.h:124
This contains simple tools for indexing with UTF8 characters swiftly.
const UInt8 High1Bit
1xxxxxxx - Is used compared against high 2 bits to determine if in middle of byte ...
Definition: unicode.h:62
const Int32 UTF8ByteRange3Max
The maximum Unicode codepoint that can fit into 3 UTF8 bytes. Equal to 2^16-1.
Definition: unicode.h:89
bool Boole
Generally acts a single bit, true or false.
Definition: datatypes.h:173
const UInt8 Low4Bit
xxxx1111
Definition: unicode.h:78
const UInt32 UTF8Null3ByteBase
This is the numerical representation 0 in a three UTF8 Sequence. Is equal to 11100000 10000000 100000...
Definition: unicode.h:97
const UInt8 Low6Bit
xx111111
Definition: unicode.h:80
STL namespace.
String AsBitString(Int32 IntToPrint)
A helper function that produces a human readable sequence of ' ', '1' and '0' characters.
Definition: unicode.cpp:71
const UInt8 Low3Bit
xxxxx111
Definition: unicode.h:77
const UInt8 High2Bit
11xxxxxx
Definition: unicode.h:63
const UInt8 IterableLowBits[]
The index of this array corresponds to the amount of low bits that are set.
Definition: unicode.h:85
const UInt8 Low5Bit
xxx11111
Definition: unicode.h:79
const Int32 UTF8ByteRange2Max
The maximum Unicode codepoint that can fit into 2 UTF8 bytes. Equal to 2^11-1.
Definition: unicode.h:88
const UInt8 IterableHighBits[]
The index of this array corresponds to the amount of high bits that are set.
Definition: unicode.h:72
const UInt32 UTF8Null4ByteBase
This is the numerical representation 0 in a four UTF8 Sequence. Is equal to 11110000 10000000 1000000...
Definition: unicode.h:98
Int32 GetIntFromCharacter(Int32 &BytesUsed, const char *CurrentCharacter)
Get a number suitable for using in an index from a character string.
Definition: unicode.cpp:86
The bulk of the engine components go in this namespace.
Definition: actor.cpp:56
const UInt32 UTF8Null2ByteBase
This is the numerical representation 0 in a two UTF8 Sequence. Is equal to 11000000 10000000...
Definition: unicode.h:96
const Int32 UTF8ByteRange4Max
The maximum Unicode codepoint that can fit into 4 UTF8 bytes. Equal to 2^21-1.
Definition: unicode.h:90
const Int32 UTF8ByteRange1Max
The maximum Unicode codepoint that can fit into a single UTF8 byte. Equal to 2^7-1.
Definition: unicode.h:87
std::string String
A datatype used to represent a series of characters.
Definition: datatypes.h:159
Int32 GetCharacterFromInt(char *Destination, Int32 BytesUsable, Int32 ByteSequence)
Convert a number that represents any valid unicode value into its UTF8 representation.
Definition: unicode.cpp:116