Einfache Code-Algorithmus arbeitet:Unterschiedliche Verhaltensweisen Algorithmus, wenn sie mit einem UTF8 auf verschiedenen Betriebssystemen
#include <iostream>
#include <string>
std::string::size_type GetLengthWithUTF(std::string &sValue);
int main()
{
std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6";
std::string sTestValueASCII = "\x67\x67\x67";
std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6";
std::string::size_type iFuncResult = 0;
std::cout << "=========== START TEST ==========\n\n";
std::cout << "+TEST UTF8 STRING\n";
std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueUTF8);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n";
std::cout << "+TEST ASCII STRING\n";
std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueASCII);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n";
std::cout << "+TEST MIX STRING\n";
std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueMIX);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n";
std::cout << "\n=========== END TEST ==========\n\n";
}
std::string::size_type GetLengthWithUTF(std::string &sValue)
{
std::cout << " +----+START GetLengthWithUTF\n";
std::cout << " +Input string is: " << sValue << "\n";
std::string::size_type i;
std::cout << " +Start cycle\n";
int iCountUTF8characters = 0;
for (i = 0; i < sValue.length(); i++)
{
std::cout << " +----+Iteration N " << i << "\n";
std::cout << " +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n";
if (sValue[i] > 127)
{
iCountUTF8characters++;
std::cout << " +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n";
}
else
{
std::cout << " +----+If statement (sValue[i] > 127) is false.\n";
}
}
std::cout << " +End cycle\n";
iCountUTF8characters = iCountUTF8characters/2;
std::cout << " +Return sValue.length() - (iCountUTF8characters/2) ---> " << sValue.length() << " - (" << iCountUTF8characters << "/2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n";
std::cout << " +----+ASCIID GetLengthWithUTF\n";
return (sValue.length() - (std::string::size_type)iCountUTF8characters);
}
Console kompilieren Befehle:
AIX 6
g++ -o test test.cpp
RHEL Server 6.7 Santiago
g++ -o test test.cpp
Microsoft W indows v10.0.14393
cl /EHsc test.cpp
Ergebnisse:
AIX 6
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 1
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 2
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 3
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 4
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 5
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (3/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 3
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 4
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 5
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 6
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 7
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 8
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (3/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 6
=========== END TEST ==========
RHEL Server 6.7 Santiago
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9
=========== END TEST ==========
Microsoft Windows v10.0.14393
=========== START TEST ==========
+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6
+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3
+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9
=========== END TEST ==========
Der Algorithmus muss die Anzahl der Zeichen in einer Zeichenfolge berechnen. Wie Sie den Ergebnissen der Tests entnehmen können, funktioniert es nur unter AIX korrekt.
Ich bin froh, wenn mir jemand helfen kann, dieses absurde Verhalten des Algorithmus für verschiedene Betriebssysteme zu verstehen. Der Algorithmus wurde unter OS AIX erstellt. Nach der Migration von AIX nach LINUX stellt sich heraus, dass ein Problem damit besteht und ich habe umfangreichere Tests durchgeführt, deren Ergebnisse Sie sehen. Meine Hauptfrage ist, wie verdammt Algorithmus unter AIX funktioniert. Ich kann es nicht logisch erklären.
Dieser Algorithmus ist falsch; Es funktioniert nur mit einer kleinen Untermenge von Unicode. Ein besserer Algorithmus ist es, die Anzahl der Bytes so zu zählen, dass 'ch & 0xC0! = 0x80', was nur nicht-initiale Codes (solche im Bereich 0x80-0xBF) eliminiert. – rici
Ja, du hast Recht. Dieser Algorithmus ist veraltet, sehr alt und prüft Zeichenfolgen unter 200 Zeichen. Er hat jedoch den Algorithmus wie oben beschrieben geändert. Ich war nur interessant das Problem zu kennen. – stoyanov