2
1

0009-QXmlStreamReader-Raise-error-on-unexpected-tokens.patch 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. From dc24f76cc621d56d11abb015654c427f7a7c0a31 Mon Sep 17 00:00:00 2001
  2. From: Axel Spoerl <axel.spoerl@qt.io>
  3. Date: Fri, 30 Jun 2023 12:43:59 +0200
  4. Subject: [PATCH] QXmlStreamReader: Raise error on unexpected tokens
  5. QXmlStreamReader accepted multiple DOCTYPE elements, containing DTD
  6. fragments in the XML prolog, and in the XML body.
  7. Well-formed but invalid XML files - with multiple DTD fragments in
  8. prolog and body, combined with recursive entity expansions - have
  9. caused infinite loops in QXmlStreamReader.
  10. This patch implements a token check in QXmlStreamReader.
  11. A stream is allowed to start with an XML prolog. StartDocument
  12. and DOCTYPE elements are only allowed in this prolog, which
  13. may also contain ProcessingInstruction and Comment elements.
  14. As soon as anything else is seen, the prolog ends.
  15. After that, the prolog-specific elements are treated as unexpected.
  16. Furthermore, the prolog can contain at most one DOCTYPE element.
  17. Update the documentation to reflect the new behavior.
  18. Add an autotest that checks the new error cases are correctly detected,
  19. and no error is raised for legitimate input.
  20. The original OSS-Fuzz files (see bug reports) are not included in this
  21. patch for file size reasons. They have been tested manually. Each of
  22. them has more than one DOCTYPE element, causing infinite loops in
  23. recursive entity expansions. The newly implemented functionality
  24. detects those invalid DTD fragments. By raising an error, it aborts
  25. stream reading before an infinite loop occurs.
  26. Thanks to OSS-Fuzz for finding this.
  27. Fixes: QTBUG-92113
  28. Fixes: QTBUG-95188
  29. Pick-to: 6.6 6.5 6.2 5.15
  30. Change-Id: I0a082b9188b2eee50b396c4d5b1c9e1fd237bbdd
  31. Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
  32. Fixes: https://security-tracker.debian.org/tracker/CVE-2023-38197
  33. Upstream: https://github.com/qt/qtbase/commit/c4301be7d5f94852e1b17f2c2989d5ca807855d4
  34. [Thomas: minor conflicts resolved when backporting]
  35. Signed-off-by: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
  36. ---
  37. src/corelib/serialization/qxmlstream.cpp | 145 +++++++++++++++++-
  38. src/corelib/serialization/qxmlstream_p.h | 11 ++
  39. .../qxmlstream/tokenError/dtdInBody.xml | 20 +++
  40. .../qxmlstream/tokenError/multipleDtd.xml | 20 +++
  41. .../qxmlstream/tokenError/wellFormed.xml | 15 ++
  42. .../qxmlstream/tst_qxmlstream.cpp | 39 +++++
  43. 6 files changed, 242 insertions(+), 8 deletions(-)
  44. create mode 100644 tests/auto/corelib/serialization/qxmlstream/tokenError/dtdInBody.xml
  45. create mode 100644 tests/auto/corelib/serialization/qxmlstream/tokenError/multipleDtd.xml
  46. create mode 100644 tests/auto/corelib/serialization/qxmlstream/tokenError/wellFormed.xml
  47. diff --git a/src/corelib/serialization/qxmlstream.cpp b/src/corelib/serialization/qxmlstream.cpp
  48. index a31bd16f1ce..ac5b291c278 100644
  49. --- a/src/corelib/serialization/qxmlstream.cpp
  50. +++ b/src/corelib/serialization/qxmlstream.cpp
  51. @@ -128,7 +128,7 @@ void reversed(const Range &&) = delete;
  52. addData() or by waiting for it to arrive on the device().
  53. \value UnexpectedElementError The parser encountered an element
  54. - that was different to those it expected.
  55. + or token that was different to those it expected.
  56. */
  57. @@ -265,13 +265,34 @@ QXmlStreamEntityResolver *QXmlStreamReader::entityResolver() const
  58. QXmlStreamReader is a well-formed XML 1.0 parser that does \e not
  59. include external parsed entities. As long as no error occurs, the
  60. - application code can thus be assured that the data provided by the
  61. - stream reader satisfies the W3C's criteria for well-formed XML. For
  62. - example, you can be certain that all tags are indeed nested and
  63. - closed properly, that references to internal entities have been
  64. - replaced with the correct replacement text, and that attributes have
  65. - been normalized or added according to the internal subset of the
  66. - DTD.
  67. + application code can thus be assured, that
  68. + \list
  69. + \li the data provided by the stream reader satisfies the W3C's
  70. + criteria for well-formed XML,
  71. + \li tokens are provided in a valid order.
  72. + \endlist
  73. +
  74. + Unless QXmlStreamReader raises an error, it guarantees the following:
  75. + \list
  76. + \li All tags are nested and closed properly.
  77. + \li References to internal entities have been replaced with the
  78. + correct replacement text.
  79. + \li Attributes have been normalized or added according to the
  80. + internal subset of the \l DTD.
  81. + \li Tokens of type \l StartDocument happen before all others,
  82. + aside from comments and processing instructions.
  83. + \li At most one DOCTYPE element (a token of type \l DTD) is present.
  84. + \li If present, the DOCTYPE appears before all other elements,
  85. + aside from StartDocument, comments and processing instructions.
  86. + \endlist
  87. +
  88. + In particular, once any token of type \l StartElement, \l EndElement,
  89. + \l Characters, \l EntityReference or \l EndDocument is seen, no
  90. + tokens of type StartDocument or DTD will be seen. If one is present in
  91. + the input stream, out of order, an error is raised.
  92. +
  93. + \note The token types \l Comment and \l ProcessingInstruction may appear
  94. + anywhere in the stream.
  95. If an error occurs while parsing, atEnd() and hasError() return
  96. true, and error() returns the error that occurred. The functions
  97. @@ -574,6 +595,7 @@ QXmlStreamReader::TokenType QXmlStreamReader::readNext()
  98. d->token = -1;
  99. return readNext();
  100. }
  101. + d->checkToken();
  102. return d->type;
  103. }
  104. @@ -658,6 +680,11 @@ static constexpr auto QXmlStreamReader_tokenTypeString = qOffsetStringArray(
  105. "ProcessingInstruction"
  106. );
  107. +static constexpr auto QXmlStreamReader_XmlContextString = qOffsetStringArray(
  108. + "Prolog",
  109. + "Body"
  110. +);
  111. +
  112. /*!
  113. \property QXmlStreamReader::namespaceProcessing
  114. \brief the namespace-processing flag of the stream reader.
  115. @@ -692,6 +719,14 @@ QString QXmlStreamReader::tokenString() const
  116. return QLatin1StringView(QXmlStreamReader_tokenTypeString.at(d->type));
  117. }
  118. +/*!
  119. + \internal
  120. + \return \param loc (Prolog/Body) as a string.
  121. + */
  122. +static constexpr QLatin1StringView contextString(QXmlStreamReaderPrivate::XmlContext ctxt)
  123. +{
  124. + return QLatin1StringView(QXmlStreamReader_XmlContextString.at(static_cast<int>(ctxt)));
  125. +}
  126. #endif // QT_NO_XMLSTREAMREADER
  127. QXmlStreamPrivateTagStack::QXmlStreamPrivateTagStack()
  128. @@ -778,6 +813,8 @@ void QXmlStreamReaderPrivate::init()
  129. type = QXmlStreamReader::NoToken;
  130. error = QXmlStreamReader::NoError;
  131. + currentContext = XmlContext::Prolog;
  132. + foundDTD = false;
  133. }
  134. /*
  135. @@ -3684,6 +3721,98 @@ void QXmlStreamWriter::writeCurrentToken(const QXmlStreamReader &reader)
  136. }
  137. }
  138. +static constexpr bool isTokenAllowedInContext(QXmlStreamReader::TokenType type,
  139. + QXmlStreamReaderPrivate::XmlContext loc)
  140. +{
  141. + switch (type) {
  142. + case QXmlStreamReader::StartDocument:
  143. + case QXmlStreamReader::DTD:
  144. + return loc == QXmlStreamReaderPrivate::XmlContext::Prolog;
  145. +
  146. + case QXmlStreamReader::StartElement:
  147. + case QXmlStreamReader::EndElement:
  148. + case QXmlStreamReader::Characters:
  149. + case QXmlStreamReader::EntityReference:
  150. + case QXmlStreamReader::EndDocument:
  151. + return loc == QXmlStreamReaderPrivate::XmlContext::Body;
  152. +
  153. + case QXmlStreamReader::Comment:
  154. + case QXmlStreamReader::ProcessingInstruction:
  155. + return true;
  156. +
  157. + case QXmlStreamReader::NoToken:
  158. + case QXmlStreamReader::Invalid:
  159. + return false;
  160. + }
  161. +
  162. + // GCC 8.x does not treat __builtin_unreachable() as constexpr
  163. +#if !defined(Q_CC_GNU_ONLY) || (Q_CC_GNU >= 900)
  164. + Q_UNREACHABLE();
  165. + return false;
  166. +#else
  167. + return false;
  168. +#endif
  169. +}
  170. +
  171. +/*!
  172. + \internal
  173. + \brief QXmlStreamReader::isValidToken
  174. + \return \c true if \param type is a valid token type.
  175. + \return \c false if \param type is an unexpected token,
  176. + which indicates a non-well-formed or invalid XML stream.
  177. + */
  178. +bool QXmlStreamReaderPrivate::isValidToken(QXmlStreamReader::TokenType type)
  179. +{
  180. + // Don't change currentContext, if Invalid or NoToken occur in the prolog
  181. + if (type == QXmlStreamReader::Invalid || type == QXmlStreamReader::NoToken)
  182. + return false;
  183. +
  184. + // If a token type gets rejected in the body, there is no recovery
  185. + const bool result = isTokenAllowedInContext(type, currentContext);
  186. + if (result || currentContext == XmlContext::Body)
  187. + return result;
  188. +
  189. + // First non-Prolog token observed => switch context to body and check again.
  190. + currentContext = XmlContext::Body;
  191. + return isTokenAllowedInContext(type, currentContext);
  192. +}
  193. +
  194. +/*!
  195. + \internal
  196. + Checks token type and raises an error, if it is invalid
  197. + in the current context (prolog/body).
  198. + */
  199. +void QXmlStreamReaderPrivate::checkToken()
  200. +{
  201. + Q_Q(QXmlStreamReader);
  202. +
  203. + // The token type must be consumed, to keep track if the body has been reached.
  204. + const XmlContext context = currentContext;
  205. + const bool ok = isValidToken(type);
  206. +
  207. + // Do nothing if an error has been raised already (going along with an unexpected token)
  208. + if (error != QXmlStreamReader::Error::NoError)
  209. + return;
  210. +
  211. + if (!ok) {
  212. + raiseError(QXmlStreamReader::UnexpectedElementError,
  213. + QObject::tr("Unexpected token type %1 in %2.")
  214. + .arg(q->tokenString(), contextString(context)));
  215. + return;
  216. + }
  217. +
  218. + if (type != QXmlStreamReader::DTD)
  219. + return;
  220. +
  221. + // Raise error on multiple DTD tokens
  222. + if (foundDTD) {
  223. + raiseError(QXmlStreamReader::UnexpectedElementError,
  224. + QObject::tr("Found second DTD token in %1.").arg(contextString(context)));
  225. + } else {
  226. + foundDTD = true;
  227. + }
  228. +}
  229. +
  230. /*!
  231. \fn bool QXmlStreamAttributes::hasAttribute(const QString &qualifiedName) const
  232. \since 4.5
  233. diff --git a/src/corelib/serialization/qxmlstream_p.h b/src/corelib/serialization/qxmlstream_p.h
  234. index 1fd69a2c1f8..f5059f8fcf9 100644
  235. --- a/src/corelib/serialization/qxmlstream_p.h
  236. +++ b/src/corelib/serialization/qxmlstream_p.h
  237. @@ -270,6 +270,17 @@ public:
  238. QStringDecoder decoder;
  239. bool atEnd;
  240. + enum class XmlContext
  241. + {
  242. + Prolog,
  243. + Body,
  244. + };
  245. +
  246. + XmlContext currentContext = XmlContext::Prolog;
  247. + bool foundDTD = false;
  248. + bool isValidToken(QXmlStreamReader::TokenType type);
  249. + void checkToken();
  250. +
  251. /*!
  252. \sa setType()
  253. */
  254. diff --git a/tests/auto/corelib/serialization/qxmlstream/tokenError/dtdInBody.xml b/tests/auto/corelib/serialization/qxmlstream/tokenError/dtdInBody.xml
  255. new file mode 100644
  256. index 00000000000..1c3ca4e2711
  257. --- /dev/null
  258. +++ b/tests/auto/corelib/serialization/qxmlstream/tokenError/dtdInBody.xml
  259. @@ -0,0 +1,20 @@
  260. +<!DOCTYPE TEST [
  261. + <!ELEMENT TESTATTRIBUTE (CASE+)>
  262. + <!ELEMENT CASE (CLASS, FUNCTION)>
  263. + <!ELEMENT CLASS (#PCDATA)>
  264. +
  265. + <!-- adding random ENTITY statement, as this is typical DTD content -->
  266. + <!ENTITY unite "&#x222a;">
  267. +
  268. + <!ATTLIST CASE CLASS CDATA #REQUIRED>
  269. +]>
  270. +<TEST>
  271. + <CASE>
  272. + <CLASS>tst_QXmlStream</CLASS>
  273. + </CASE>
  274. + <!-- invalid DTD in XML body follows -->
  275. + <!DOCTYPE DTDTEST [
  276. + <!ELEMENT RESULT (CASE+)>
  277. + <!ATTLIST RESULT OUTPUT CDATA #REQUIRED>
  278. + ]>
  279. +</TEST>
  280. diff --git a/tests/auto/corelib/serialization/qxmlstream/tokenError/multipleDtd.xml b/tests/auto/corelib/serialization/qxmlstream/tokenError/multipleDtd.xml
  281. new file mode 100644
  282. index 00000000000..cd398c0f9fd
  283. --- /dev/null
  284. +++ b/tests/auto/corelib/serialization/qxmlstream/tokenError/multipleDtd.xml
  285. @@ -0,0 +1,20 @@
  286. +<!DOCTYPE TEST [
  287. + <!ELEMENT TESTATTRIBUTE (CASE+)>
  288. + <!ELEMENT CASE (CLASS, FUNCTION, DATASET, COMMENTS)>
  289. + <!ELEMENT CLASS (#PCDATA)>
  290. +
  291. + <!-- adding random ENTITY statements, as this is typical DTD content -->
  292. + <!ENTITY iff "&hArr;">
  293. +
  294. + <!ATTLIST CASE CLASS CDATA #REQUIRED>
  295. +]>
  296. +<!-- invalid second DTD follows -->
  297. +<!DOCTYPE SECOND [
  298. + <!ELEMENT SECONDATTRIBUTE (#PCDATA)>
  299. + <!ENTITY on "&#8728;">
  300. +]>
  301. +<TEST>
  302. + <CASE>
  303. + <CLASS>tst_QXmlStream</CLASS>
  304. + </CASE>
  305. +</TEST>
  306. diff --git a/tests/auto/corelib/serialization/qxmlstream/tokenError/wellFormed.xml b/tests/auto/corelib/serialization/qxmlstream/tokenError/wellFormed.xml
  307. new file mode 100644
  308. index 00000000000..1b61a3f0622
  309. --- /dev/null
  310. +++ b/tests/auto/corelib/serialization/qxmlstream/tokenError/wellFormed.xml
  311. @@ -0,0 +1,15 @@
  312. +<!DOCTYPE TEST [
  313. + <!ELEMENT TESTATTRIBUTE (CASE+)>
  314. + <!ELEMENT CASE (CLASS, FUNCTION, DATASET, COMMENTS)>
  315. + <!ELEMENT CLASS (#PCDATA)>
  316. +
  317. + <!-- adding random ENTITY statements, as this is typical DTD content -->
  318. + <!ENTITY unite "&#x222a;">
  319. +
  320. + <!ATTLIST CASE CLASS CDATA #REQUIRED>
  321. +]>
  322. +<TEST>
  323. + <CASE>
  324. + <CLASS>tst_QXmlStream</CLASS>
  325. + </CASE>
  326. +</TEST>
  327. diff --git a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
  328. index c64088d4774..8d2738700ea 100644
  329. --- a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
  330. +++ b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
  331. @@ -584,6 +584,9 @@ private slots:
  332. void entityExpansionLimit() const;
  333. + void tokenErrorHandling_data() const;
  334. + void tokenErrorHandling() const;
  335. +
  336. private:
  337. static QByteArray readFile(const QString &filename);
  338. @@ -1754,5 +1757,41 @@ void tst_QXmlStream::roundTrip() const
  339. QCOMPARE(out, in);
  340. }
  341. +void tst_QXmlStream::tokenErrorHandling_data() const
  342. +{
  343. + QTest::addColumn<QString>("fileName");
  344. + QTest::addColumn<QXmlStreamReader::Error>("expectedError");
  345. + QTest::addColumn<QString>("errorKeyWord");
  346. +
  347. + constexpr auto invalid = QXmlStreamReader::Error::UnexpectedElementError;
  348. + constexpr auto valid = QXmlStreamReader::Error::NoError;
  349. + QTest::newRow("DtdInBody") << "dtdInBody.xml" << invalid << "DTD";
  350. + QTest::newRow("multipleDTD") << "multipleDtd.xml" << invalid << "second DTD";
  351. + QTest::newRow("wellFormed") << "wellFormed.xml" << valid << "";
  352. +}
  353. +
  354. +void tst_QXmlStream::tokenErrorHandling() const
  355. +{
  356. + QFETCH(const QString, fileName);
  357. + QFETCH(const QXmlStreamReader::Error, expectedError);
  358. + QFETCH(const QString, errorKeyWord);
  359. +
  360. + const QDir dir(QFINDTESTDATA("tokenError"));
  361. + QFile file(dir.absoluteFilePath(fileName));
  362. +
  363. + // Cross-compiling: File will be on host only
  364. + if (!file.exists())
  365. + QSKIP("Testfile not found.");
  366. +
  367. + file.open(QIODevice::ReadOnly);
  368. + QXmlStreamReader reader(&file);
  369. + while (!reader.atEnd())
  370. + reader.readNext();
  371. +
  372. + QCOMPARE(reader.error(), expectedError);
  373. + if (expectedError != QXmlStreamReader::Error::NoError)
  374. + QVERIFY(reader.errorString().contains(errorKeyWord));
  375. +}
  376. +
  377. #include "tst_qxmlstream.moc"
  378. // vim: et:ts=4:sw=4:sts=4
  379. --
  380. 2.46.0