Skip to content

Commit 59eed8a

Browse files
committed
Auto merge of #90460 - pietroalbini:bidi-stable, r=nikomatsakis,pietroalbini
[stable] Fix CVE-2021-42574 and prepare Rust 1.56.1

This PR implements new lints to mitigate the impact of [CVE-2021-42574], caused by the presence of bidirectional-override Unicode codepoints in the compiled source code. [See the advisory][advisory] for more information about the vulnerability.

The changes in this PR will be released later today as part of Rust 1.56.1.

[CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
[advisory]: https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html
2 parents 09c42c4 + 6552f7a commit 59eed8a

File tree

15 files changed

+545
-11
lines changed

15 files changed

+545
-11
lines changed

Cargo.lock

+1
Original file line numberDiff line numberDiff line change
@@ -4101,6 +4101,7 @@ dependencies = [
41014101
"rustc_span",
41024102
"tracing",
41034103
"unicode-normalization",
4104+
"unicode-width",
41044105
]
41054106

41064107
[[package]]

RELEASES.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
Version 1.56.1 (2021-11-01)
2+
===========================
3+
4+
- New lints to detect the presence of bidirectional-override Unicode
5+
codepoints in the compiled source code ([CVE-2021-42574])
6+
7+
[CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
8+
19
Version 1.56.0 (2021-10-21)
210
========================
311

compiler/rustc_errors/src/emitter.rs

+19-1
Original file line numberDiff line numberDiff line change
@@ -2054,8 +2054,26 @@ fn num_decimal_digits(num: usize) -> usize {
20542054
MAX_DIGITS
20552055
}
20562056

2057+
// We replace some characters so the CLI output is always consistent and underlines aligned.
2058+
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2059+
('\t', " "), // We do our own tab replacement
2060+
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
2061+
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
2062+
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
2063+
('\u{202E}', ""),
2064+
('\u{2066}', ""),
2065+
('\u{2067}', ""),
2066+
('\u{2068}', ""),
2067+
('\u{202C}', ""),
2068+
('\u{2069}', ""),
2069+
];
2070+
20572071
fn replace_tabs(str: &str) -> String {
2058-
str.replace('\t', " ")
2072+
let mut s = str.to_string();
2073+
for (c, replacement) in OUTPUT_REPLACEMENTS {
2074+
s = s.replace(*c, replacement);
2075+
}
2076+
s
20592077
}
20602078

20612079
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {

compiler/rustc_lint/src/context.rs

+38-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
1717
use self::TargetLint::*;
1818

19+
use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
1920
use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
2021
use crate::passes::{EarlyLintPassObject, LateLintPassObject};
2122
use rustc_ast as ast;
@@ -40,7 +41,7 @@ use rustc_session::lint::{FutureIncompatibleInfo, Level, Lint, LintBuffer, LintI
4041
use rustc_session::Session;
4142
use rustc_session::SessionLintStore;
4243
use rustc_span::lev_distance::find_best_match_for_name;
43-
use rustc_span::{symbol::Symbol, MultiSpan, Span, DUMMY_SP};
44+
use rustc_span::{symbol::Symbol, BytePos, MultiSpan, Span, DUMMY_SP};
4445
use rustc_target::abi::{self, LayoutOf};
4546
use tracing::debug;
4647

@@ -612,6 +613,42 @@ pub trait LintContext: Sized {
612613
// Now, set up surrounding context.
613614
let sess = self.sess();
614615
match diagnostic {
616+
BuiltinLintDiagnostics::UnicodeTextFlow(span, content) => {
617+
let spans: Vec<_> = content
618+
.char_indices()
619+
.filter_map(|(i, c)| {
620+
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
621+
let lo = span.lo() + BytePos(2 + i as u32);
622+
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
623+
})
624+
})
625+
.collect();
626+
let (an, s) = match spans.len() {
627+
1 => ("an ", ""),
628+
_ => ("", "s"),
629+
};
630+
db.span_label(span, &format!(
631+
"this comment contains {}invisible unicode text flow control codepoint{}",
632+
an,
633+
s,
634+
));
635+
for (c, span) in &spans {
636+
db.span_label(*span, format!("{:?}", c));
637+
}
638+
db.note(
639+
"these kind of unicode codepoints change the way text flows on \
640+
applications that support them, but can cause confusion because they \
641+
change the order of characters on the screen",
642+
);
643+
if !spans.is_empty() {
644+
db.multipart_suggestion_with_style(
645+
"if their presence wasn't intentional, you can remove them",
646+
spans.into_iter().map(|(_, span)| (span, "".to_string())).collect(),
647+
Applicability::MachineApplicable,
648+
SuggestionStyle::HideCodeAlways,
649+
);
650+
}
651+
},
615652
BuiltinLintDiagnostics::Normal => (),
616653
BuiltinLintDiagnostics::BareTraitObject(span, is_global) => {
617654
let (sugg, app) = match sess.source_map().span_to_snippet(span) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
use crate::{EarlyContext, EarlyLintPass, LintContext};
2+
use rustc_ast as ast;
3+
use rustc_errors::{Applicability, SuggestionStyle};
4+
use rustc_span::{BytePos, Span, Symbol};
5+
6+
declare_lint! {
7+
/// The `text_direction_codepoint_in_literal` lint detects Unicode codepoints that change the
8+
/// visual representation of text on screen in a way that does not correspond to their
9+
/// in-memory representation.
10+
///
11+
/// ### Explanation
12+
///
13+
/// The unicode characters `\u{202A}`, `\u{202B}`, `\u{202D}`, `\u{202E}`, `\u{2066}`,
14+
/// `\u{2067}`, `\u{2068}`, `\u{202C}` and `\u{2069}` make the flow of text on screen change
15+
/// its direction on software that supports these codepoints. This makes the text "abc" display
16+
/// as "cba" on screen. By leveraging software that supports these, people can write specially
17+
/// crafted literals that make the surrounding code seem like it's performing one action, when
18+
/// in reality it is performing another. Because of this, we proactively lint against their
19+
/// presence to avoid surprises.
20+
///
21+
/// ### Example
22+
///
23+
/// ```rust,compile_fail
24+
/// #![deny(text_direction_codepoint_in_literal)]
25+
/// fn main() {
26+
/// println!("{:?}", '‮');
27+
/// }
28+
/// ```
29+
///
30+
/// {{produces}}
31+
///
32+
pub TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
33+
Deny,
34+
"detect special Unicode codepoints that affect the visual representation of text on screen, \
35+
changing the direction in which text flows",
36+
}
37+
38+
declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);
39+
40+
crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
41+
'\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
42+
'\u{2069}',
43+
];
44+
45+
impl HiddenUnicodeCodepoints {
46+
fn lint_text_direction_codepoint(
47+
&self,
48+
cx: &EarlyContext<'_>,
49+
text: Symbol,
50+
span: Span,
51+
padding: u32,
52+
point_at_inner_spans: bool,
53+
label: &str,
54+
) {
55+
// Obtain the `Span`s for each of the forbidden chars.
56+
let spans: Vec<_> = text
57+
.as_str()
58+
.char_indices()
59+
.filter_map(|(i, c)| {
60+
UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
61+
let lo = span.lo() + BytePos(i as u32 + padding);
62+
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
63+
})
64+
})
65+
.collect();
66+
67+
cx.struct_span_lint(TEXT_DIRECTION_CODEPOINT_IN_LITERAL, span, |lint| {
68+
let mut err = lint.build(&format!(
69+
"unicode codepoint changing visible direction of text present in {}",
70+
label
71+
));
72+
let (an, s) = match spans.len() {
73+
1 => ("an ", ""),
74+
_ => ("", "s"),
75+
};
76+
err.span_label(
77+
span,
78+
&format!(
79+
"this {} contains {}invisible unicode text flow control codepoint{}",
80+
label, an, s,
81+
),
82+
);
83+
if point_at_inner_spans {
84+
for (c, span) in &spans {
85+
err.span_label(*span, format!("{:?}", c));
86+
}
87+
}
88+
err.note(
89+
"these kind of unicode codepoints change the way text flows on applications that \
90+
support them, but can cause confusion because they change the order of \
91+
characters on the screen",
92+
);
93+
if point_at_inner_spans && !spans.is_empty() {
94+
err.multipart_suggestion_with_style(
95+
"if their presence wasn't intentional, you can remove them",
96+
spans.iter().map(|(_, span)| (*span, "".to_string())).collect(),
97+
Applicability::MachineApplicable,
98+
SuggestionStyle::HideCodeAlways,
99+
);
100+
err.multipart_suggestion(
101+
"if you want to keep them but make them visible in your source code, you can \
102+
escape them",
103+
spans
104+
.into_iter()
105+
.map(|(c, span)| {
106+
let c = format!("{:?}", c);
107+
(span, c[1..c.len() - 1].to_string())
108+
})
109+
.collect(),
110+
Applicability::MachineApplicable,
111+
);
112+
} else {
113+
// FIXME: in other suggestions we've reversed the inner spans of doc comments. We
114+
// should do the same here to provide the same good suggestions as we do for
115+
// literals above.
116+
err.note("if their presence wasn't intentional, you can remove them");
117+
err.note(&format!(
118+
"if you want to keep them but make them visible in your source code, you can \
119+
escape them: {}",
120+
spans
121+
.into_iter()
122+
.map(|(c, _)| { format!("{:?}", c) })
123+
.collect::<Vec<String>>()
124+
.join(", "),
125+
));
126+
}
127+
err.emit();
128+
});
129+
}
130+
}
131+
impl EarlyLintPass for HiddenUnicodeCodepoints {
132+
fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
133+
if let ast::AttrKind::DocComment(_, comment) = attr.kind {
134+
if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
135+
self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
136+
}
137+
}
138+
}
139+
140+
fn check_expr(&mut self, cx: &EarlyContext<'_>, expr: &ast::Expr) {
141+
// byte strings are already handled well enough by `EscapeError::NonAsciiCharInByteString`
142+
let (text, span, padding) = match &expr.kind {
143+
ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
144+
let text = token.symbol;
145+
if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
146+
return;
147+
}
148+
let padding = match kind {
149+
// account for `"` or `'`
150+
ast::LitKind::Str(_, ast::StrStyle::Cooked) | ast::LitKind::Char(_) => 1,
151+
// account for `r###"`
152+
ast::LitKind::Str(_, ast::StrStyle::Raw(val)) => *val as u32 + 2,
153+
_ => return,
154+
};
155+
(text, span, padding)
156+
}
157+
_ => return,
158+
};
159+
self.lint_text_direction_codepoint(cx, text, *span, padding, true, "literal");
160+
}
161+
}

compiler/rustc_lint/src/lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ mod array_into_iter;
4848
pub mod builtin;
4949
mod context;
5050
mod early;
51+
pub mod hidden_unicode_codepoints;
5152
mod internal;
5253
mod late;
5354
mod levels;
@@ -77,6 +78,7 @@ use rustc_span::Span;
7778

7879
use array_into_iter::ArrayIntoIter;
7980
use builtin::*;
81+
use hidden_unicode_codepoints::*;
8082
use internal::*;
8183
use methods::*;
8284
use non_ascii_idents::*;
@@ -128,6 +130,7 @@ macro_rules! early_lint_passes {
128130
DeprecatedAttr: DeprecatedAttr::new(),
129131
WhileTrue: WhileTrue,
130132
NonAsciiIdents: NonAsciiIdents,
133+
HiddenUnicodeCodepoints: HiddenUnicodeCodepoints,
131134
IncompleteFeatures: IncompleteFeatures,
132135
RedundantSemicolons: RedundantSemicolons,
133136
UnusedDocComment: UnusedDocComment,

compiler/rustc_lint_defs/src/builtin.rs

+28
Original file line numberDiff line numberDiff line change
@@ -3416,3 +3416,31 @@ declare_lint! {
34163416
Warn,
34173417
"`break` expression with label and unlabeled loop as value expression"
34183418
}
3419+
3420+
declare_lint! {
3421+
/// The `text_direction_codepoint_in_comment` lint detects Unicode codepoints in comments that
3422+
/// change the visual representation of text on screen in a way that does not correspond to
3423+
/// their in-memory representation.
3424+
///
3425+
/// ### Example
3426+
///
3427+
/// ```rust,compile_fail
3428+
/// #![deny(text_direction_codepoint_in_comment)]
3429+
/// fn main() {
3430+
/// println!("{:?}"); // '‮');
3431+
/// }
3432+
/// ```
3433+
///
3434+
/// {{produces}}
3435+
///
3436+
/// ### Explanation
3437+
///
3438+
/// Unicode allows changing the visual flow of text on screen in order to support scripts that
3439+
/// are written right-to-left, but a specially crafted comment can make code that will be
3440+
/// compiled appear to be part of a comment, depending on the software used to read the code.
3441+
/// To avoid potential problems or confusion, such as in CVE-2021-42574, by default we deny
3442+
/// their use.
3443+
pub TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
3444+
Deny,
3445+
"invisible directionality-changing codepoints in comment"
3446+
}

compiler/rustc_lint_defs/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ pub enum BuiltinLintDiagnostics {
306306
TrailingMacro(bool, Ident),
307307
BreakWithLabelAndLoop(Span),
308308
NamedAsmLabel(String),
309+
UnicodeTextFlow(Span, String),
309310
}
310311

311312
/// Lints that are buffered up early on in the `Session` before the

compiler/rustc_parse/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ rustc_session = { path = "../rustc_session" }
1818
rustc_span = { path = "../rustc_span" }
1919
rustc_ast = { path = "../rustc_ast" }
2020
unicode-normalization = "0.1.11"
21+
unicode-width = "0.1.4"

0 commit comments

Comments
 (0)