@@ -16,6 +16,11 @@ enum ItemType
16
16
ArgumentSeparator,
17
17
Statement,
18
18
StatementSeparator,
19
+ StringComponent,
20
+ StringSeparator,
21
+ StringWhitespace,
22
+ FormatSpecifier,
23
+ EscapeSequence,
19
24
Group,
20
25
Container,
21
26
StartOfContainer,
@@ -250,6 +255,161 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
250
255
return result;
251
256
}
252
257
258
+ static vector<InstructionTextToken> ParseStringToken (
259
+ const InstructionTextToken& unprocessedStringToken,
260
+ const size_t maxParsingLength)
261
+ {
262
+ const auto & src = unprocessedStringToken.text ;
263
+ const size_t tail = src.size ();
264
+
265
+ // Max parsing length set to max annotation length
266
+ if (tail > maxParsingLength)
267
+ return { unprocessedStringToken };
268
+ vector<InstructionTextToken> result;
269
+ size_t curStart = 0 , curEnd = 0 ;
270
+
271
+ auto ConstructToken = [&](size_t start, size_t end) {
272
+ InstructionTextToken token = unprocessedStringToken;
273
+ const string newTxt = string (src.substr (start, end - start));
274
+ token.text = newTxt;
275
+ token.width = newTxt.size ();
276
+ result.emplace_back (token);
277
+ };
278
+
279
+ auto flushToken = [&](size_t start, size_t end)
280
+ {
281
+ if (start < end)
282
+ ConstructToken (start, end);
283
+ };
284
+
285
+ // We generally split along spaces while keeping words intact, but some cases have
286
+ // specific splitting behavior:
287
+ //
288
+ // - Any format specifier (starting with %) will be treated as an atom even if embedded
289
+ // within a word
290
+ // - Any escape sequence will also be treated as an atom
291
+ // - We split along punctuation like commas, colons, periods, and semicolons, grouping
292
+ // trailing punctuation together.
293
+ while (curEnd < tail)
294
+ {
295
+ char c = src[curEnd];
296
+
297
+ if (c == ' %' )
298
+ {
299
+ // Flush before format specifier
300
+ flushToken (curStart, curEnd);
301
+
302
+ size_t start = curEnd;
303
+ curEnd++;
304
+ while (curEnd < tail && (isalnum (src[curEnd]) || src[curEnd]==' .' || src[curEnd]==' -' ))
305
+ curEnd++;
306
+ ConstructToken (start, curEnd);
307
+ curStart = curEnd;
308
+ }
309
+ else if (c == ' \\ ' )
310
+ {
311
+ // Flush before escape sequence
312
+ flushToken (curStart, curEnd);
313
+
314
+ size_t start = curEnd;
315
+ curEnd++; // consume '\'
316
+ if (curEnd < tail)
317
+ curEnd++; // consume escaped char
318
+ ConstructToken (start, curEnd);
319
+ curStart = curEnd;
320
+ }
321
+ else if (c == ' ,' || c == ' .' || c == ' :' || c == ' ;' || isspace (c))
322
+ {
323
+ // Flush before punctuation
324
+ flushToken (curStart, curEnd);
325
+
326
+ // Group together repeated punctuation
327
+ size_t start = curEnd;
328
+ while (curEnd < tail && src[curEnd] == c)
329
+ curEnd++;
330
+ ConstructToken (start, curEnd);
331
+ curStart = curEnd;
332
+ }
333
+ else
334
+ {
335
+ curEnd++;
336
+ }
337
+ }
338
+
339
+ flushToken (curStart, curEnd);
340
+ return result;
341
+ }
342
+
343
+ static vector<Item> CreateStringGroups (const vector<Item>& items)
344
+ {
345
+ vector<Item> result, pending;
346
+ bool hasStrings = false ;
347
+ for (auto & i : items)
348
+ {
349
+ if (i.type == StringSeparator && !i.tokens .empty ())
350
+ {
351
+ // We try to push separators onto a preceding word, otherwise treat as
352
+ // a singular atom
353
+ if (pending.empty ())
354
+ {
355
+ result.push_back (Item {Atom, {}, {i.tokens }, 0 });
356
+ }
357
+ else
358
+ {
359
+ for (auto & j : i.tokens )
360
+ pending.back ().AddTokenToLastAtom (j);
361
+ result.push_back (Item {StringComponent, pending, {}, 0 });
362
+ }
363
+ pending.clear ();
364
+ hasStrings = true ;
365
+ }
366
+ else if (i.type == StringWhitespace)
367
+ {
368
+ // Special case because we let whitespace trail even if over width
369
+ if (!pending.empty ())
370
+ {
371
+ result.push_back (Item {StringComponent, pending, {}, 0 });
372
+ pending.clear ();
373
+ }
374
+ result.push_back (Item {StringWhitespace, i.items , i.tokens , i.width });
375
+ }
376
+ else if (i.type == FormatSpecifier || i.type == EscapeSequence)
377
+ {
378
+ // Flush previous tokens before special sequences like format specifiers or
379
+ // escape sequences
380
+ if (!pending.empty ())
381
+ {
382
+ result.push_back (Item {StringComponent, pending, {}, 0 });
383
+ pending.clear ();
384
+ }
385
+ result.push_back (Item { Atom, i.items , i.tokens , i.width });
386
+ }
387
+
388
+ else if (i.type == StartOfContainer && pending.empty ())
389
+ {
390
+ result.push_back (i);
391
+ }
392
+ else if (i.type == EndOfContainer && hasStrings && !pending.empty ())
393
+ {
394
+ result.push_back (Item {StringComponent, pending, {}, 0 });
395
+ result.push_back (i);
396
+ }
397
+ else
398
+ {
399
+ pending.push_back (Item {i.type , CreateStringGroups (i.items ), i.tokens , 0 });
400
+ }
401
+ }
402
+
403
+ if (!pending.empty ())
404
+ {
405
+ if (hasStrings)
406
+ result.push_back (Item {StringComponent, pending, {}, 0 });
407
+ else
408
+ result.insert (result.end (), pending.begin (), pending.end ());
409
+ }
410
+
411
+ return result;
412
+ }
253
413
254
414
static vector<Item> CreateAssignmentOperatorGroups (const vector<Item>& items)
255
415
{
@@ -576,8 +736,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
576
736
size_t tokenIndex = indentationTokens.size ();
577
737
578
738
// First break the line down into nested container items. A container is anything between a pair of
579
- // BraceTokens (except for strings, where the entire string, including the quotes, are treated as
580
- // a single atom).
739
+ // BraceTokens
581
740
vector<Item> items;
582
741
stack<vector<Item>> itemStack;
583
742
for (; tokenIndex < currentLine.tokens .size (); tokenIndex++)
@@ -588,29 +747,30 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
588
747
switch (token.type )
589
748
{
590
749
case BraceToken:
750
+ // Beginning of string
591
751
if (tokenIndex + 1 < currentLine.tokens .size ()
592
752
&& currentLine.tokens [tokenIndex + 1 ].type == StringToken)
593
753
{
594
- // Treat string tokens surrounded by brace tokens as a unit (this is usually the quotes
595
- // surrounding the string)
596
- Item atom;
597
- atom.type = Atom;
598
- atom.tokens .push_back (token);
599
- atom.tokens .push_back (currentLine.tokens [tokenIndex + 1 ]);
600
- atom.width = 0 ;
601
- tokenIndex++;
602
- if (tokenIndex + 1 < currentLine.tokens .size ()
603
- && currentLine.tokens [tokenIndex + 1 ].type == BraceToken)
604
- {
605
- atom.tokens .push_back (currentLine.tokens [tokenIndex + 1 ]);
606
- tokenIndex++;
607
- }
754
+ // Create a ContainerContents item and place it onto the item stack. This will hold anything
755
+ // inside the container once the end of the container is found.
756
+ items.push_back (Item {Container, {}, {}, 0 });
757
+ itemStack.push (items);
608
758
609
- items.push_back (atom);
610
- break ;
759
+ // Starting a new context
760
+ items.clear ();
761
+ items.push_back (Item {StartOfContainer, {}, {token}, 0 });
611
762
}
612
-
613
- if (trimmedText == " (" || trimmedText == " [" || trimmedText == " {" )
763
+ // End of string
764
+ else if (currentLine.tokens [tokenIndex].type == StringToken
765
+ && tokenIndex + 1 < currentLine.tokens .size ()
766
+ && currentLine.tokens [tokenIndex + 1 ].type == BraceToken)
767
+ {
768
+ // Create a ContainerContents item and place it onto the item stack. This will hold anything
769
+ // inside the container once the end of the container is found.
770
+ items.push_back (Item {Container, {}, {}, 0 });
771
+ itemStack.push (items);
772
+ }
773
+ else if (trimmedText == " (" || trimmedText == " [" || trimmedText == " {" )
614
774
{
615
775
// Create a ContainerContents item and place it onto the item stack. This will hold anything
616
776
// inside the container once the end of the container is found.
@@ -663,6 +823,25 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
663
823
else
664
824
items.push_back (Item {Operator, {}, {token}, 0 });
665
825
break ;
826
+ case StringToken:
827
+ {
828
+ vector<InstructionTextToken> stringTokens = ParseStringToken (token, settings.maximumAnnotationLength );
829
+ for (auto subToken : stringTokens)
830
+ {
831
+ string trimmedSubText = TrimString (subToken.text );
832
+ if (trimmedSubText.empty ())
833
+ items.push_back (Item {StringWhitespace, {}, {subToken}, 0 });
834
+ if (trimmedSubText[0 ] == ' %' )
835
+ items.push_back (Item {FormatSpecifier, {}, {subToken}, 0 });
836
+ else if (!trimmedSubText.empty () && trimmedSubText[0 ] == ' \\ ' )
837
+ items.push_back (Item {EscapeSequence, {}, {subToken}, 0 });
838
+ else if (trimmedSubText[0 ] == ' ,' || trimmedSubText[0 ] == ' .' || trimmedSubText[0 ] == ' :' || trimmedSubText[0 ] == ' ;' )
839
+ items.push_back (Item {StringSeparator, {}, {subToken}, 0 });
840
+ else
841
+ items.push_back (Item {Atom, {}, {subToken}, 0 });
842
+ }
843
+ break ;
844
+ }
666
845
default :
667
846
items.push_back (Item {Atom, {}, {token}, 0 });
668
847
break ;
@@ -699,6 +878,10 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
699
878
// the previous atom.
700
879
items = RelocateStartAndEndOfContainerItems (items);
701
880
881
+ // Create internal groupings for displaying strings -- grouping items by punctuation, format specifiers, and
882
+ // escape sequences
883
+ items = CreateStringGroups (items);
884
+
702
885
// Now that items are done, compute widths for layout
703
886
for (auto & j : items)
704
887
j.CalculateWidth ();
@@ -754,9 +937,16 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
754
937
755
938
for (auto item = items.begin (); item != items.end ();)
756
939
{
757
- if (currentWidth + item->width > desiredWidth)
940
+ if (item->type == StringComponent && currentWidth + item->width > desiredWidth)
941
+ {
942
+ // If a string is too wide to fit on the current line, create a newline
943
+ // without additional indentation
944
+ newLine ();
945
+ }
946
+ else if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
758
947
{
759
948
// Current item is too wide to fit on the current line, will need to start a new line.
949
+ // Whitespace is allowed to be too wide; we push it on as the preceding word is wrapped.
760
950
auto next = item;
761
951
++next;
762
952
0 commit comments