Skip to content

Commit 86f8881

Browse files
authored
Fix UTF8 encoding of invalid character (#3191)
1 parent 5be3425 commit 86f8881

File tree

2 files changed

+647
-399
lines changed

2 files changed

+647
-399
lines changed

src/CLR/CorLib/corlib_native_System_String.cpp

Lines changed: 106 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,59 @@ HRESULT Library_corlib_native_System_String::ToCharArray(CLR_RT_StackFrame &stac
585585
NANOCLR_NOCLEANUP();
586586
}
587587

588+
// Helper function for comparing UTF-8 substrings
589+
bool MatchString(CLR_RT_UnicodeHelper &inputIter, const char *searchStr, int searchCharLen)
590+
{
591+
// Create copies to preserve original iterator state
592+
CLR_RT_UnicodeHelper inputCopy = inputIter;
593+
CLR_RT_UnicodeHelper searchIter;
594+
searchIter.SetInputUTF8(searchStr);
595+
596+
for (int i = 0; i < searchCharLen; i++)
597+
{
598+
CLR_UINT16 bufInput[3] = {0};
599+
CLR_UINT16 bufSearch[3] = {0};
600+
601+
// Set up buffers for character conversion
602+
inputCopy.m_outputUTF16 = bufInput;
603+
inputCopy.m_outputUTF16_size = MAXSTRLEN(bufInput);
604+
searchIter.m_outputUTF16 = bufSearch;
605+
searchIter.m_outputUTF16_size = MAXSTRLEN(bufSearch);
606+
607+
// Convert next character from input
608+
if (!inputCopy.ConvertFromUTF8(1, false))
609+
{
610+
// Input ended prematurely
611+
return false;
612+
}
613+
614+
// Convert next character from search string
615+
if (!searchIter.ConvertFromUTF8(1, false))
616+
{
617+
// Shouldn't happen for valid search string
618+
return false;
619+
}
620+
621+
// Compare first UTF-16 code unit
622+
if (bufInput[0] != bufSearch[0])
623+
{
624+
return false;
625+
}
626+
627+
// Handle surrogate pairs (4-byte UTF-8 sequences)
628+
if (bufInput[0] >= 0xD800 && bufInput[0] <= 0xDBFF)
629+
{
630+
// High surrogate
631+
if (bufInput[1] != bufSearch[1])
632+
{
633+
// Low surrogate mismatch
634+
return false;
635+
}
636+
}
637+
}
638+
return true;
639+
}
640+
588641
HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, int mode)
589642
{
590643
NATIVE_PROFILE_CLR_CORE();
@@ -594,8 +647,8 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
594647
int startIndex;
595648
int count;
596649
int pos;
597-
const char *pString;
598-
const CLR_UINT16 *pChars;
650+
const char *pString = NULL;
651+
const CLR_UINT16 *pChars = NULL;
599652
int iChars = 0;
600653
CLR_RT_UnicodeHelper inputIterator;
601654
int inputLen;
@@ -605,8 +658,6 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
605658
if (!szText)
606659
szText = "";
607660
pos = -1;
608-
pString = NULL;
609-
pChars = NULL;
610661

611662
if (mode & c_IndexOf__SingleChar)
612663
{
@@ -617,23 +668,20 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
617668
{
618669
CLR_RT_HeapBlock_Array *array = stack.Arg1().DereferenceArray();
619670
FAULT_ON_NULL(array);
620-
621671
pChars = (const CLR_UINT16 *)array->GetFirstElement();
622672
iChars = array->m_numOfElements;
623673
}
624674
else if (mode & c_IndexOf__String)
625675
{
626676
pString = stack.Arg1().RecoverString();
627677
FAULT_ON_NULL(pString);
628-
// how long is the search string?
629678
inputIterator.SetInputUTF8(pString);
630679
searchLen = inputIterator.CountNumberOfCharacters();
631680
}
632681

633-
// calculate input string length
682+
// Calculate input length
634683
inputIterator.SetInputUTF8(szText);
635684
inputLen = inputIterator.CountNumberOfCharacters();
636-
637685
if (0 == inputLen)
638686
{
639687
pos = -1;
@@ -647,7 +695,6 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
647695
}
648696
else
649697
{
650-
// for mode LastIndex... we are searching backwards toward the start of the string
651698
if (mode & c_IndexOf__Last)
652699
{
653700
startIndex = inputLen - 1;
@@ -663,49 +710,53 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
663710
NANOCLR_SET_AND_LEAVE(CLR_E_OUT_OF_RANGE);
664711

665712
// for mode LastIndex... with string we move the start index back by the search string length -1
713+
// (this adjustment applies only when searching backward)
666714
if ((mode & c_IndexOf__String_Last) == c_IndexOf__String_Last)
667715
{
668716
startIndex -= searchLen - 1;
669-
// check the start index; if not in range skip the search
717+
// check the start index; if not in range, skip the search
670718
if (startIndex < 0 || startIndex > inputLen)
719+
{
671720
goto Exit;
721+
}
672722
}
673723

674724
// calculate the iteration count
675725
if (mode & c_IndexOf__Count)
676726
{
677-
// count form parameter
727+
// count (from parameter)
678728
count = stack.Arg3().NumericByRefConst().s4;
679729
}
680730
else
681731
{
682732
// for mode LastIndex... we are searching from start index backwards toward the start of the string
683733
if (mode & c_IndexOf__Last)
684734
{
685-
// backward until the start of string
686-
// one more time than the startIndex because we should iterate until zero
735+
// backwards until the start of the string
736+
// one more than startIndex, because we iterate down to and including position zero
687737
count = startIndex + 1;
688738
}
689739
else
690740
{
691-
// forward until the end of string
741+
// move forward until reaching the end of the string
692742
count = inputLen - startIndex;
693743
}
694744
}
695745

696-
// for mode with string we reduce the count by the search string length -1
697-
// if we search foreward
746+
// forward search with index of string mode: adjust the count by the search string length -1
698747
if ((mode & c_IndexOf__String_Last) == c_IndexOf__String)
699748
{
700749
count -= searchLen - 1;
701750
}
702751

703-
// check the count
752+
// validate count
704753
if (mode & c_IndexOf__Last)
705754
{
706755
// check for backward mode; no exception; just exit
707756
if (count > startIndex + 1)
757+
{
708758
goto Exit;
759+
}
709760
}
710761
else
711762
{
@@ -717,132 +768,87 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
717768
// First move to the character, then read it.
718769
if (inputIterator.ConvertFromUTF8(startIndex, true))
719770
{
720-
// string mode?
771+
// String search mode
721772
if (pString)
722773
{
723-
// iterate thru all positions
724774
while (count-- > 0)
725775
{
726-
CLR_RT_UnicodeHelper inputString;
727-
inputString.SetInputUTF8((const char *)inputIterator.m_inputUTF8);
728-
CLR_RT_UnicodeHelper searchString;
729-
searchString.SetInputUTF8(pString);
730-
bool finished = false;
731-
732-
while (true)
733-
{
734-
CLR_UINT16 bufInput[3];
735-
CLR_UINT16 bufSearch[3];
736-
737-
inputString.m_outputUTF16 = bufInput;
738-
inputString.m_outputUTF16_size = MAXSTRLEN(bufInput);
739-
740-
searchString.m_outputUTF16 = bufSearch;
741-
searchString.m_outputUTF16_size = MAXSTRLEN(bufSearch);
742-
743-
// read next char from search string; if no more chars to read (false)
744-
// then we are done and found the search string in the input string
745-
if (searchString.ConvertFromUTF8(1, false) == false)
746-
{
747-
pos = startIndex;
748-
finished = true;
749-
break;
750-
}
751-
752-
// read the next char from the input string; if no more chars to read (false)
753-
// we didn't found the search string in the input string; we abort the search now
754-
if (inputString.ConvertFromUTF8(1, false) == false)
755-
{
756-
finished = true;
757-
break;
758-
}
759-
760-
// does the char from input not match the char from the search string
761-
if (bufInput[0] != bufSearch[0])
762-
{
763-
// next iteration round but not finished
764-
break;
765-
}
766-
}
767-
768-
// finished (with or without a found) then break
769-
if (finished)
776+
// Use helper for proper UTF-8 comparison
777+
if (MatchString(inputIterator, pString, searchLen))
770778
{
779+
pos = startIndex;
771780
break;
772781
}
773782

774-
// reading forward or backward
783+
// Move to the next candidate position (either forward or backward, depending on the mode)
775784
if (mode & c_IndexOf__Last)
776785
{
777786
startIndex--;
778-
// move one chars backward
779-
if (inputIterator.MoveBackwardInUTF8(szText, 1) == false)
787+
// move backwards one char
788+
if (!inputIterator.MoveBackwardInUTF8(szText, 1))
780789
{
781790
break;
782791
}
783792
}
784793
else
785794
{
786795
startIndex++;
787-
// move to the next char
788-
if (inputIterator.ConvertFromUTF8(1, true) == false)
796+
// move forward to the next char
797+
if (!inputIterator.ConvertFromUTF8(1, true))
789798
{
790799
break;
791800
}
792801
}
793802
}
794803
}
795804

796-
// char mode?
797-
if (pChars)
805+
// Character search mode
806+
else if (pChars)
798807
{
799-
// iterate thru all positions
808+
// iterate through all positions
800809
while (count-- > 0)
801810
{
802-
CLR_UINT16 buf[3];
811+
CLR_UINT16 buf[3] = {0};
803812

804813
inputIterator.m_outputUTF16 = buf;
805814
inputIterator.m_outputUTF16_size = MAXSTRLEN(buf);
806815

807816
// read the next char from the input string; if no more chars to read (false)
808-
// we didn't found the search chars in the input string
809-
if (inputIterator.ConvertFromUTF8(1, false) == false)
817+
// the search chars weren't found in the input string
818+
if (!inputIterator.ConvertFromUTF8(1, false))
810819
{
811820
break;
812821
}
813822

814-
// test each search char if it's a match
823+
// test each search char for a match
815824
for (int i = 0; i < iChars; i++)
816825
{
817-
// match?
818826
if (buf[0] == pChars[i])
819827
{
820-
// position found!
828+
// match found: record its position
821829
pos = startIndex;
822830
break;
823831
}
824832
}
825833

826-
// found? => break
834+
// a match was found: stop searching
827835
if (pos != -1)
828836
{
829837
break;
830838
}
831839

832-
// for mode LastIndex... we are searching from start index backwards toward the start of the string
840+
// for search mode LastIndex: we are searching from start index backwards toward the start of the string
833841
if (mode & c_IndexOf__Last)
834842
{
835-
// in backward mode
843+
// backwards mode
836844
startIndex--;
837-
// move two chars backward, because the current char is already read
838-
if (inputIterator.MoveBackwardInUTF8(szText, 2) == false)
839-
{
845+
// have to move two chars backwards, because the current char is already read
846+
if (!inputIterator.MoveBackwardInUTF8(szText, 2))
840847
break;
841-
}
842848
}
843849
else
844850
{
845-
// forward mode; simple advance the start index
851+
// forward mode: just advance the start index
846852
startIndex++;
847853
}
848854
}
@@ -851,7 +857,6 @@ HRESULT Library_corlib_native_System_String::IndexOf(CLR_RT_StackFrame &stack, i
851857

852858
Exit:
853859
stack.SetResult_I4(pos);
854-
855860
NANOCLR_NOCLEANUP();
856861
}
857862

@@ -889,10 +894,11 @@ HRESULT Library_corlib_native_System_String::ChangeCase(CLR_RT_StackFrame &stack
889894
*ptr++ = c;
890895
}
891896

892-
NANOCLR_CHECK_HRESULT(CLR_RT_HeapBlock_String::CreateInstance(
893-
stack.PushValue(),
894-
(CLR_UINT16 *)arrayTmp->GetFirstElement(),
895-
arrayTmp->m_numOfElements));
897+
NANOCLR_CHECK_HRESULT(
898+
CLR_RT_HeapBlock_String::CreateInstance(
899+
stack.PushValue(),
900+
(CLR_UINT16 *)arrayTmp->GetFirstElement(),
901+
arrayTmp->m_numOfElements));
896902

897903
NANOCLR_NOCLEANUP();
898904
}
@@ -923,10 +929,11 @@ HRESULT Library_corlib_native_System_String::Substring(CLR_RT_StackFrame &stack,
923929
NANOCLR_SET_AND_LEAVE(CLR_E_OUT_OF_RANGE);
924930
}
925931

926-
NANOCLR_CHECK_HRESULT(CLR_RT_HeapBlock_String::CreateInstance(
927-
stack.PushValue(),
928-
(CLR_UINT16 *)arrayTmp->GetElement(startIndex),
929-
length));
932+
NANOCLR_CHECK_HRESULT(
933+
CLR_RT_HeapBlock_String::CreateInstance(
934+
stack.PushValue(),
935+
(CLR_UINT16 *)arrayTmp->GetElement(startIndex),
936+
length));
930937

931938
NANOCLR_NOCLEANUP();
932939
}
@@ -1102,10 +1109,11 @@ HRESULT Library_corlib_native_System_String::Split(CLR_RT_StackFrame &stack, CLR
11021109
{
11031110
CLR_RT_HeapBlock *str = (CLR_RT_HeapBlock *)arrayDst->GetElement(count);
11041111

1105-
NANOCLR_CHECK_HRESULT(CLR_RT_HeapBlock_String::CreateInstance(
1106-
*str,
1107-
pSrcStart,
1108-
(CLR_UINT32)(pSrc - pSrcStart)));
1112+
NANOCLR_CHECK_HRESULT(
1113+
CLR_RT_HeapBlock_String::CreateInstance(
1114+
*str,
1115+
pSrcStart,
1116+
(CLR_UINT32)(pSrc - pSrcStart)));
11091117

11101118
pSrcStart = pSrc + 1;
11111119
}

0 commit comments

Comments
 (0)