Skip to content

Commit 72fc065

Browse files
committed
Add support for string wrapping in formatter
1 parent df2691a commit 72fc065

File tree

4 files changed

+215
-21
lines changed

4 files changed

+215
-21
lines changed

binaryninjaapi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14274,6 +14274,7 @@ namespace BinaryNinja {
1427414274
size_t desiredLineLength;
1427514275
size_t minimumContentLength;
1427614276
size_t tabWidth;
14277+
size_t maximumAnnotationLength;
1427714278
std::string languageName;
1427814279
std::string commentStartString;
1427914280
std::string commentEndString;

binaryninjacore.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3669,6 +3669,7 @@ extern "C"
36693669
size_t desiredLineLength;
36703670
size_t minimumContentLength;
36713671
size_t tabWidth;
3672+
size_t maximumAnnotationLength;
36723673
char* languageName;
36733674
char* commentStartString;
36743675
char* commentEndString;

formatter/generic/genericformatter.cpp

Lines changed: 211 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ enum ItemType
1616
ArgumentSeparator,
1717
Statement,
1818
StatementSeparator,
19+
StringComponent,
20+
StringSeparator,
21+
StringWhitespace,
22+
FormatSpecifier,
23+
EscapeSequence,
1924
Group,
2025
Container,
2126
StartOfContainer,
@@ -250,6 +255,161 @@ static vector<Item> CreateStatementItems(const vector<Item>& items)
250255
return result;
251256
}
252257

258+
// Splits the text of a string token into smaller tokens that can be wrapped individually.
//
// Every returned token is a copy of `unprocessedStringToken` whose `text` is replaced by a
// slice of the original text and whose `width` is set to the length of that slice. The slices
// are produced in order and together cover the whole original text.
//
// If the text is longer than `maxParsingLength`, no splitting is attempted and the input token
// is returned as the only element.
static vector<InstructionTextToken> ParseStringToken(
	const InstructionTextToken& unprocessedStringToken,
	const size_t maxParsingLength)
{
	const auto& src = unprocessedStringToken.text;
	const size_t tail = src.size();

	// Overly long strings are left as a single unsplit token
	if (tail > maxParsingLength)
		return { unprocessedStringToken };

	vector<InstructionTextToken> result;
	size_t curStart = 0, curEnd = 0;

	// The <cctype> classifiers have undefined behavior when handed a negative value, which a
	// plain `char` holds for any non-ASCII byte. Always convert through `unsigned char` first.
	auto isAlnumChar = [](char ch) { return isalnum(static_cast<unsigned char>(ch)) != 0; };
	auto isSpaceChar = [](char ch) { return isspace(static_cast<unsigned char>(ch)) != 0; };

	// Appends a copy of the original token restricted to the text in [start, end)
	auto constructToken = [&](size_t start, size_t end) {
		result.push_back(unprocessedStringToken);
		InstructionTextToken& token = result.back();
		token.text = string(src.substr(start, end - start));
		token.width = token.text.size();
	};

	// Same as constructToken, but does nothing for an empty range
	auto flushToken = [&](size_t start, size_t end) {
		if (start < end)
			constructToken(start, end);
	};

	// We generally split along spaces while keeping words intact, but some cases have
	// specific splitting behavior:
	//
	// - Any format specifier (starting with %) will be treated as an atom even if embedded
	//   within a word
	// - Any escape sequence will also be treated as an atom
	// - We split along punctuation like commas, colons, periods, and semicolons, grouping
	//   repeated occurrences of the same character together.
	while (curEnd < tail)
	{
		const char c = src[curEnd];

		if (c == '%')
		{
			// Flush the word in progress before the format specifier
			flushToken(curStart, curEnd);

			// The specifier is the '%' plus any following run of alphanumerics, '.' and '-'
			const size_t start = curEnd;
			curEnd++;
			while (curEnd < tail && (isAlnumChar(src[curEnd]) || src[curEnd] == '.' || src[curEnd] == '-'))
				curEnd++;
			constructToken(start, curEnd);
			curStart = curEnd;
		}
		else if (c == '\\')
		{
			// Flush the word in progress before the escape sequence
			flushToken(curStart, curEnd);

			const size_t start = curEnd;
			curEnd++;  // consume '\'
			if (curEnd < tail)
				curEnd++;  // consume escaped char
			constructToken(start, curEnd);
			curStart = curEnd;
		}
		else if (c == ',' || c == '.' || c == ':' || c == ';' || isSpaceChar(c))
		{
			// Flush the word in progress before the punctuation or whitespace
			flushToken(curStart, curEnd);

			// Group together a run of the same punctuation or whitespace character
			const size_t start = curEnd;
			while (curEnd < tail && src[curEnd] == c)
				curEnd++;
			constructToken(start, curEnd);
			curStart = curEnd;
		}
		else
		{
			// Ordinary character, extend the word in progress
			curEnd++;
		}
	}

	// Emit whatever word is still in progress at the end of the text
	flushToken(curStart, curEnd);
	return result;
}
342+
343+
// Regroups a list of items so that the pieces of a string are collected into StringComponent
// items. Items are accumulated in `pending` and flushed into the result as a single
// StringComponent when a string separator, string whitespace, format specifier, escape sequence,
// or end of container is reached. Nested items are processed recursively.
//
// If no string separator was seen at this level, items left in `pending` at the end are
// appended to the result individually rather than being wrapped in a StringComponent.
static vector<Item> CreateStringGroups(const vector<Item>& items)
{
	vector<Item> result, pending;
	bool hasStrings = false;

	// Moves everything accumulated so far into the result as one StringComponent
	auto flushPending = [&]() {
		if (pending.empty())
			return;
		result.push_back(Item {StringComponent, pending, {}, 0});
		pending.clear();
	};

	for (auto& i : items)
	{
		if (i.type == StringSeparator && !i.tokens.empty())
		{
			// We try to push separators onto a preceding word, otherwise treat as
			// a singular atom
			if (pending.empty())
			{
				result.push_back(Item {Atom, {}, {i.tokens}, 0});
			}
			else
			{
				for (auto& j : i.tokens)
					pending.back().AddTokenToLastAtom(j);
				flushPending();
			}
			hasStrings = true;
		}
		else if (i.type == StringWhitespace)
		{
			// Special case because we let whitespace trail even if over width
			flushPending();
			result.push_back(Item {StringWhitespace, i.items, i.tokens, i.width});
		}
		else if (i.type == FormatSpecifier || i.type == EscapeSequence)
		{
			// Flush previous tokens before special sequences like format specifiers or
			// escape sequences, which are emitted as standalone atoms
			flushPending();
			result.push_back(Item {Atom, i.items, i.tokens, i.width});
		}
		else if (i.type == StartOfContainer && pending.empty())
		{
			result.push_back(i);
		}
		else if (i.type == EndOfContainer && hasStrings && !pending.empty())
		{
			// Close out the string contents before the end of the container. The pending
			// items must be cleared here, otherwise they would be emitted a second time by
			// a later flush or by the final flush after the loop.
			flushPending();
			result.push_back(i);
		}
		else
		{
			// Anything else joins the group in progress, with its children regrouped
			pending.push_back(Item {i.type, CreateStringGroups(i.items), i.tokens, 0});
		}
	}

	if (!pending.empty())
	{
		if (hasStrings)
			result.push_back(Item {StringComponent, pending, {}, 0});
		else
			result.insert(result.end(), pending.begin(), pending.end());
	}

	return result;
}
253413

254414
static vector<Item> CreateAssignmentOperatorGroups(const vector<Item>& items)
255415
{
@@ -576,8 +736,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
576736
size_t tokenIndex = indentationTokens.size();
577737

578738
// First break the line down into nested container items. A container is anything between a pair of
579-
// BraceTokens (except for strings, where the entire string, including the quotes, are treated as
580-
// a single atom).
739+
// BraceTokens
581740
vector<Item> items;
582741
stack<vector<Item>> itemStack;
583742
for (; tokenIndex < currentLine.tokens.size(); tokenIndex++)
@@ -588,29 +747,30 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
588747
switch (token.type)
589748
{
590749
case BraceToken:
750+
// Beginning of string
591751
if (tokenIndex + 1 < currentLine.tokens.size()
592752
&& currentLine.tokens[tokenIndex + 1].type == StringToken)
593753
{
594-
// Treat string tokens surrounded by brace tokens as a unit (this is usually the quotes
595-
// surrounding the string)
596-
Item atom;
597-
atom.type = Atom;
598-
atom.tokens.push_back(token);
599-
atom.tokens.push_back(currentLine.tokens[tokenIndex + 1]);
600-
atom.width = 0;
601-
tokenIndex++;
602-
if (tokenIndex + 1 < currentLine.tokens.size()
603-
&& currentLine.tokens[tokenIndex + 1].type == BraceToken)
604-
{
605-
atom.tokens.push_back(currentLine.tokens[tokenIndex + 1]);
606-
tokenIndex++;
607-
}
754+
// Create a ContainerContents item and place it onto the item stack. This will hold anything
755+
// inside the container once the end of the container is found.
756+
items.push_back(Item {Container, {}, {}, 0});
757+
itemStack.push(items);
608758

609-
items.push_back(atom);
610-
break;
759+
// Starting a new context
760+
items.clear();
761+
items.push_back(Item {StartOfContainer, {}, {token}, 0});
611762
}
612-
613-
if (trimmedText == "(" || trimmedText == "[" || trimmedText == "{")
763+
// End of string
764+
else if (currentLine.tokens[tokenIndex].type == StringToken
765+
&& tokenIndex + 1 < currentLine.tokens.size()
766+
&& currentLine.tokens[tokenIndex + 1].type == BraceToken)
767+
{
768+
// Create a ContainerContents item and place it onto the item stack. This will hold anything
769+
// inside the container once the end of the container is found.
770+
items.push_back(Item {Container, {}, {}, 0});
771+
itemStack.push(items);
772+
}
773+
else if (trimmedText == "(" || trimmedText == "[" || trimmedText == "{")
614774
{
615775
// Create a ContainerContents item and place it onto the item stack. This will hold anything
616776
// inside the container once the end of the container is found.
@@ -663,6 +823,25 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
663823
else
664824
items.push_back(Item {Operator, {}, {token}, 0});
665825
break;
826+
case StringToken:
{
	// Break the string into sub-tokens and classify each one by its first non-whitespace
	// character so that later passes can group and wrap the string contents.
	const vector<InstructionTextToken> stringTokens = ParseStringToken(token, settings.maximumAnnotationLength);
	for (const auto& subToken : stringTokens)
	{
		const string trimmedSubText = TrimString(subToken.text);
		if (trimmedSubText.empty())
		{
			// Whitespace-only piece. Skip the remaining checks so that it is added exactly
			// once, instead of also falling through and being added again as an Atom.
			items.push_back(Item {StringWhitespace, {}, {subToken}, 0});
			continue;
		}

		const char lead = trimmedSubText[0];
		if (lead == '%')
			items.push_back(Item {FormatSpecifier, {}, {subToken}, 0});
		else if (lead == '\\')
			items.push_back(Item {EscapeSequence, {}, {subToken}, 0});
		else if (lead == ',' || lead == '.' || lead == ':' || lead == ';')
			items.push_back(Item {StringSeparator, {}, {subToken}, 0});
		else
			items.push_back(Item {Atom, {}, {subToken}, 0});
	}
	break;
}
666845
default:
667846
items.push_back(Item {Atom, {}, {token}, 0});
668847
break;
@@ -699,6 +878,10 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
699878
// the previous atom.
700879
items = RelocateStartAndEndOfContainerItems(items);
701880

881+
// Create internal groupings for displaying strings -- grouping items by punctuation, format specifiers, and
882+
// escape sequences
883+
items = CreateStringGroups(items);
884+
702885
// Now that items are done, compute widths for layout
703886
for (auto& j : items)
704887
j.CalculateWidth();
@@ -754,9 +937,16 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
754937

755938
for (auto item = items.begin(); item != items.end();)
756939
{
757-
if (currentWidth + item->width > desiredWidth)
940+
if (item->type == StringComponent && currentWidth + item->width > desiredWidth)
941+
{
942+
// If a string is too wide to fit on the current line, create a newline
943+
// without additional indentation
944+
newLine();
945+
}
946+
else if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
758947
{
759948
// Current item is too wide to fit on the current line, will need to start a new line.
949+
// Whitespace is allowed to be too wide; we push it on as the preceding word is wrapped.
760950
auto next = item;
761951
++next;
762952

lineformatter.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ LineFormatterSettings LineFormatterSettings::FromAPIObject(const BNLineFormatter
5353
result.desiredLineLength = settings->desiredLineLength;
5454
result.minimumContentLength = settings->minimumContentLength;
5555
result.tabWidth = settings->tabWidth;
56+
result.maximumAnnotationLength = settings->maximumAnnotationLength;
5657
result.languageName = settings->languageName;
5758
result.commentStartString = settings->commentStartString;
5859
result.commentEndString = settings->commentEndString;
@@ -69,6 +70,7 @@ BNLineFormatterSettings LineFormatterSettings::ToAPIObject() const
6970
result.desiredLineLength = desiredLineLength;
7071
result.minimumContentLength = minimumContentLength;
7172
result.tabWidth = tabWidth;
73+
result.maximumAnnotationLength = maximumAnnotationLength;
7274
result.languageName = (char*)languageName.c_str();
7375
result.commentStartString = (char*)commentStartString.c_str();
7476
result.commentEndString = (char*)commentEndString.c_str();

0 commit comments

Comments
 (0)