2016-11-18 3 views
3

Einfache Code-Algorithmus arbeitet:Unterschiedliche Verhaltensweisen Algorithmus, wenn sie mit einem UTF8 auf verschiedenen Betriebssystemen

#include <iostream> 
#include <string> 

std::string::size_type GetLengthWithUTF(std::string &sValue); 

int main() 
{ 
    std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string sTestValueASCII = "\x67\x67\x67"; 
    std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string::size_type iFuncResult = 0; 

    std::cout << "=========== START TEST ==========\n\n"; 

    std::cout << "+TEST UTF8 STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueUTF8); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST ASCII STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueASCII); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST MIX STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueMIX); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "\n=========== END TEST ==========\n\n"; 
} 

std::string::size_type GetLengthWithUTF(std::string &sValue) 
{ 
    std::cout << "  +----+START GetLengthWithUTF\n"; 
    std::cout << "   +Input string is: " << sValue << "\n"; 
    std::string::size_type i; 
    std::cout << "   +Start cycle\n"; 
    int iCountUTF8characters = 0; 
    for (i = 0; i < sValue.length(); i++) 
    { 
     std::cout << "   +----+Iteration N " << i << "\n"; 
     std::cout << "    +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n"; 
     if (sValue[i] > 127) 
     { 
      iCountUTF8characters++; 
      std::cout << "    +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n"; 
     } 
     else 
     { 
      std::cout << "    +----+If statement (sValue[i] > 127) is false.\n"; 
     } 
    } 

    std::cout << "   +End cycle\n"; 
    iCountUTF8characters = iCountUTF8characters/2; 
    std::cout << "   +Return sValue.length() - (iCountUTF8characters/2) ---> " << sValue.length() << " - (" << iCountUTF8characters << "/2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n"; 
    std::cout << "  +----+ASCIID GetLengthWithUTF\n"; 
    return (sValue.length() - (std::string::size_type)iCountUTF8characters); 
} 

Console kompilieren Befehle:

AIX 6

g++ -o test test.cpp 

RHEL Server 6.7 Santiago

g++ -o test test.cpp 

Microsoft W indows v10.0.14393

cl /EHsc test.cpp 



Ergebnisse:

AIX 6

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 2 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 4 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (3/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 3 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 5 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 7 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (3/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 6 


=========== END TEST ========== 

RHEL Server 6.7 Santiago

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

Microsoft Windows v10.0.14393

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

Der Algorithmus muss die Anzahl der Zeichen in einer Zeichenfolge berechnen. Wie Sie den Ergebnissen der Tests entnehmen können, funktioniert es nur unter AIX korrekt.

Ich bin froh, wenn mir jemand helfen kann, dieses absurde Verhalten des Algorithmus für verschiedene Betriebssysteme zu verstehen. Der Algorithmus wurde unter OS AIX erstellt. Nach der Migration von AIX nach LINUX stellt sich heraus, dass ein Problem damit besteht und ich habe umfangreichere Tests durchgeführt, deren Ergebnisse Sie sehen. Meine Hauptfrage ist, wie verdammt Algorithmus unter AIX funktioniert. Ich kann es nicht logisch erklären.

+2

Dieser Algorithmus ist falsch; Es funktioniert nur mit einer kleinen Untermenge von Unicode. Ein besserer Algorithmus ist es, die Anzahl der Bytes so zu zählen, dass 'ch & 0xC0! = 0x80', was nur nicht-initiale Codes (solche im Bereich 0x80-0xBF) eliminiert. – rici

+0

Ja, du hast Recht. Dieser Algorithmus ist veraltet, sehr alt und prüft Zeichenfolgen unter 200 Zeichen. Er hat jedoch den Algorithmus wie oben beschrieben geändert. Ich war nur interessant das Problem zu kennen. – stoyanov

Antwort

4

Es scheint, dass die beiden Arten von System sich in der Art und Weise unterscheiden, wie sie das Zeichen der Zeichen behandeln, was vom Standard erlaubt ist. Ihr AIX-Compiler behandelt char s als unsigniert, während die anderen beiden Systeme sie als signiert behandeln.

Auf Systemen mit unsignierten Zeichen verhält sich die Bedingung sValue[i] > 127 genau so, wie man es erwarten würde. Derselbe Ausdruck ist jedoch auf Systemen mit vorzeichenbehafteten Zeichen nicht erfolgreich.

Deshalb erhalten Sie negative Zahlen für Zeichen mit Codes von 128 und höher. Zum Beispiel wird 208 zu -48, wenn es als ein vorzeichenbehafteter Wert behandelt wird.

Sie können dieses Problem beheben, indem eine Umwandlung in unsigned zwingen oder durch die acht Bit mit einer Bitmaske Überprüfung:

if (sValue[i] & 128) { 
    ... // MSB is set 
} 
+0

Verdammt! Stimmt! Vielen Dank!!! if (unsigned (sValue [i])> 127) ist eine langsame Version, aber für einige Entwickler besser lesbar;) – stoyanov

Verwandte Themen