#![no_std]
#![doc(html_root_url = "https://docs.rs/xmlparser/0.13.1")]
#![forbid(unsafe_code)]
#![warn(missing_docs)]
#![allow(ellipsis_inclusive_range_patterns)]
#[cfg(feature = "std")]
#[macro_use]
extern crate std;
// Evaluates to `true` when the expression matches the given pattern and to
// `false` otherwise. The pattern tokens are pasted verbatim into a `match` arm,
// so alternatives (`a | b`) and `if` guards are accepted as well.
macro_rules! matches {
    ($value:expr, $($arm:tt)+) => {
        match $value {
            $($arm)+ => true,
            _ => false,
        }
    };
}
mod error;
mod stream;
mod strspan;
mod xmlchar;
pub use crate::error::*;
pub use crate::stream::*;
pub use crate::strspan::*;
pub use crate::xmlchar::*;
/// A single token produced by [`Tokenizer`].
///
/// `span` covers the token as it appears in the input, delimiters included;
/// the remaining fields are the interesting parts of that text.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Token<'a> {
/// XML declaration, e.g. `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`.
///
/// `version` is the version number without quotes, `encoding` the encoding
/// name when one is given, and `standalone` is `Some(true)` for `yes` and
/// `Some(false)` for `no`.
Declaration {
version: StrSpan<'a>,
encoding: Option<StrSpan<'a>>,
standalone: Option<bool>,
span: StrSpan<'a>,
},
/// Processing instruction, `<?target content?>`.
///
/// `content` is `None` when nothing follows the target.
ProcessingInstruction {
target: StrSpan<'a>,
content: Option<StrSpan<'a>>,
span: StrSpan<'a>,
},
/// Comment, `<!-- text -->`; `text` is what sits between the delimiters.
Comment {
text: StrSpan<'a>,
span: StrSpan<'a>,
},
/// Start of a DOCTYPE that opens an internal subset: `<!DOCTYPE name ... [`.
DtdStart {
name: StrSpan<'a>,
external_id: Option<ExternalId<'a>>,
span: StrSpan<'a>,
},
/// DOCTYPE without an internal subset: `<!DOCTYPE name ... >`.
EmptyDtd {
name: StrSpan<'a>,
external_id: Option<ExternalId<'a>>,
span: StrSpan<'a>,
},
/// Entity declaration inside the DTD: `<!ENTITY name definition>`.
EntityDeclaration {
name: StrSpan<'a>,
definition: EntityDefinition<'a>,
span: StrSpan<'a>,
},
/// End of the DTD internal subset: `]>`.
DtdEnd {
span: StrSpan<'a>,
},
/// Opening of an element: `<` followed by its qualified name (`prefix:local`).
///
/// Attributes and the tag end are reported as separate tokens.
ElementStart {
prefix: StrSpan<'a>,
local: StrSpan<'a>,
span: StrSpan<'a>,
},
/// Attribute, `prefix:local="value"`; `value` excludes the quotes.
Attribute {
prefix: StrSpan<'a>,
local: StrSpan<'a>,
value: StrSpan<'a>,
span: StrSpan<'a>,
},
/// End of a start tag (`>` or `/>`) or a closing tag (`</name>`); see [`ElementEnd`].
ElementEnd {
end: ElementEnd<'a>,
span: StrSpan<'a>,
},
/// Character data between markup: everything up to the next `<`.
Text {
text: StrSpan<'a>,
},
/// CDATA section, `<![CDATA[text]]>`; `text` excludes the delimiters.
Cdata {
text: StrSpan<'a>,
span: StrSpan<'a>,
},
}
/// The way an element tag ends, carried by [`Token::ElementEnd`].
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum ElementEnd<'a> {
/// `>` terminating a start tag; the element has content and is now open.
Open,
/// Closing tag `</prefix:local>`; the fields are the prefix and the local name.
Close(StrSpan<'a>, StrSpan<'a>),
/// `/>` terminating a start tag; the element has no content.
Empty,
}
/// External identifier found in a DOCTYPE or an entity declaration.
///
/// All literals are stored without their surrounding quotes.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum ExternalId<'a> {
/// `SYSTEM "literal"`.
System(StrSpan<'a>),
/// `PUBLIC "literal1" "literal2"`, in source order.
Public(StrSpan<'a>, StrSpan<'a>),
}
/// The definition part of an `<!ENTITY ...>` declaration.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum EntityDefinition<'a> {
/// A quoted replacement value, stored without the quotes.
EntityValue(StrSpan<'a>),
/// A `SYSTEM`/`PUBLIC` external identifier.
ExternalId(ExternalId<'a>),
}
// Outcome of producing a token: the value or a crate-level `Error`.
type Result<T> = core::result::Result<T, Error>;
// Outcome of a low-level stream operation: the value or a `StreamError`.
// The `parse_*` wrappers convert these into `Error` values with a position.
type StreamResult<T> = core::result::Result<T, StreamError>;
/// Position of the tokenizer within the document structure.
#[derive(Clone, Copy, PartialEq)]
enum State {
/// Initial state for a whole document: an optional `<?xml ...?>` may follow.
Declaration,
/// After the (optional) XML declaration, before the DOCTYPE or the root element.
AfterDeclaration,
/// Inside the internal subset of a DOCTYPE, until `]>`.
Dtd,
/// After the DOCTYPE (or when there is none), before the root element.
AfterDtd,
/// Inside element content; also the initial state for fragment parsing.
Elements,
/// Inside a start tag, right after the element name, until `>` or `/>`.
Attributes,
/// After the root element has been closed.
AfterElements,
/// Terminal state entered after an error; no more tokens are produced.
End,
}
/// Pull tokenizer for XML text.
///
/// Implements `Iterator`, yielding one `Result<Token, Error>` per call. After
/// the first error the tokenizer stops and yields nothing further.
pub struct Tokenizer<'a> {
// The input together with the current read position.
stream: Stream<'a>,
// Which part of the document is being tokenized.
state: State,
// Number of elements currently open: incremented on `ElementEnd::Open`,
// decremented (never below zero) on a closing tag.
depth: usize,
// Set by `from_fragment`; when true the tokenizer never leaves element
// content for the `AfterElements` state.
fragment_parsing: bool,
}
impl<'a> From<&'a str> for Tokenizer<'a> {
    /// Creates a tokenizer for a whole document.
    ///
    /// If `text` begins with the bytes `EF BB BF` (the UTF-8 byte order mark)
    /// they are skipped. Tokenizing starts in the `Declaration` state.
    #[inline]
    fn from(text: &'a str) -> Self {
        const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

        let mut stream = Stream::from(text);
        if stream.starts_with(UTF8_BOM) {
            stream.advance(UTF8_BOM.len());
        }

        Tokenizer {
            stream,
            state: State::Declaration,
            depth: 0,
            fragment_parsing: false,
        }
    }
}
// Evaluates `$fun` (a stream-level parser call) and, on failure, wraps the
// stream error into `Error::$err` together with the text position of the place
// where the stream stood before `$fun` ran.
macro_rules! map_err_at {
    ($fun:expr, $stream:expr, $err:ident) => {{
        let origin = $stream.pos();
        match $fun {
            Ok(value) => Ok(value),
            Err(cause) => Err(Error::$err(cause, $stream.gen_text_pos_from(origin))),
        }
    }};
}
impl<'a> Tokenizer<'a> {
/// Creates a tokenizer for the `fragment` range of `full_text`.
///
/// The tokenizer starts directly in element-content mode with fragment
/// parsing enabled: no XML declaration or DOCTYPE is looked for, and closing
/// the outermost element does not end content parsing.
pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range<usize>) -> Self {
    let stream = Stream::from_substr(full_text, fragment);

    Tokenizer {
        stream,
        state: State::Elements,
        depth: 0,
        fragment_parsing: true,
    }
}
fn parse_next_impl(&mut self) -> Option<Result<Token<'a>>> {
let s = &mut self.stream;
if s.at_end() {
return None;
}
let start = s.pos();
match self.state {
State::Declaration => {
self.state = State::AfterDeclaration;
if s.starts_with(b"<?xml ") {
Some(Self::parse_declaration(s))
} else {
self.parse_next_impl()
}
}
State::AfterDeclaration => {
if s.starts_with(b"<!DOCTYPE") {
let t = Self::parse_doctype(s);
match t {
Ok(Token::DtdStart { .. }) => self.state = State::Dtd,
Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
_ => {}
}
Some(t)
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with_space() {
s.skip_spaces();
self.parse_next_impl()
} else {
self.state = State::AfterDtd;
self.parse_next_impl()
}
}
State::Dtd => {
if s.starts_with(b"<!ENTITY") {
Some(Self::parse_entity_decl(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with(b"]>") {
self.state = State::AfterDtd;
s.advance(2);
Some(Ok(Token::DtdEnd { span: s.slice_back(start) }))
} else if s.starts_with_space() {
s.skip_spaces();
self.parse_next_impl()
} else if s.starts_with(b"<!ELEMENT")
|| s.starts_with(b"<!ATTLIST")
|| s.starts_with(b"<!NOTATION")
{
if Self::consume_decl(s).is_err() {
let pos = s.gen_text_pos_from(start);
Some(Err(Error::UnknownToken(pos)))
} else {
self.parse_next_impl()
}
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::AfterDtd => {
if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with(b"<!") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else if s.starts_with(b"<") {
self.state = State::Attributes;
Some(Self::parse_element_start(s))
} else if s.starts_with_space() {
s.skip_spaces();
self.parse_next_impl()
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::Elements => {
match s.curr_byte() {
Ok(b'<') => {
match s.next_byte() {
Ok(b'!') => {
if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<![CDATA[") {
Some(Self::parse_cdata(s))
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
Ok(b'?') => {
if !s.starts_with(b"<?xml ") {
Some(Self::parse_pi(s))
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
Ok(b'/') => {
if self.depth > 0 {
self.depth -= 1;
}
if self.depth == 0 && !self.fragment_parsing {
self.state = State::AfterElements;
} else {
self.state = State::Elements;
}
Some(Self::parse_close_element(s))
}
Ok(_) => {
self.state = State::Attributes;
Some(Self::parse_element_start(s))
}
Err(_) => {
return Some(Err(Error::UnknownToken(s.gen_text_pos())));
}
}
}
Ok(_) => {
Some(Self::parse_text(s))
}
Err(_) => {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
}
State::Attributes => {
let t = Self::parse_attribute(s);
if let Ok(Token::ElementEnd { end, .. }) = t {
if end == ElementEnd::Open {
self.depth += 1;
}
if self.depth == 0 && !self.fragment_parsing {
self.state = State::AfterElements;
} else {
self.state = State::Elements;
}
}
Some(t.map_err(|e| Error::InvalidAttribute(e, s.gen_text_pos_from(start))))
}
State::AfterElements => {
if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with_space() {
s.skip_spaces();
self.parse_next_impl()
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::End => {
None
}
}
}
/// Parses an XML declaration at the current position.
///
/// A stream-level failure is reported as `Error::InvalidDeclaration` with the
/// text position where the declaration started.
fn parse_declaration(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_declaration_impl(s), s, InvalidDeclaration)
}
/// Parses `<?xml version=".." [encoding=".."] [standalone=".."] ?>`.
///
/// Skips the first six bytes unconditionally, so the stream must be
/// positioned at `<?xml `.
fn parse_declaration_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    // Pseudo-attributes must be separated by whitespace, unless the
    // declaration ends here (`?>`) or the input runs out.
    fn expect_separator(s: &mut Stream) -> StreamResult<()> {
        if s.starts_with_space() {
            s.skip_spaces();
            return Ok(());
        }

        if s.starts_with(b"?>") || s.at_end() {
            return Ok(());
        }

        Err(StreamError::InvalidSpace(s.curr_byte_unchecked(), s.gen_text_pos()))
    }

    let decl_start = s.pos();
    s.advance(6); // `<?xml `

    let version = Self::parse_version_info(s)?;
    expect_separator(s)?;

    let encoding = match Self::parse_encoding_decl(s)? {
        Some(name) => {
            expect_separator(s)?;
            Some(name)
        }
        None => None,
    };

    let standalone = Self::parse_standalone(s)?;

    s.skip_spaces();
    s.skip_string(b"?>")?;

    Ok(Token::Declaration {
        version,
        encoding,
        standalone,
        span: s.slice_back(decl_start),
    })
}
/// Parses `version="1.x"` (either quote style) and returns the version number.
///
/// The number must start with `1.`; any digits after it are included.
fn parse_version_info(s: &mut Stream<'a>) -> StreamResult<StrSpan<'a>> {
    s.skip_spaces();
    s.skip_string(b"version")?;
    s.consume_eq()?;

    let quote = s.consume_quote()?;

    let number_start = s.pos();
    s.skip_string(b"1.")?;
    s.skip_bytes(|_, byte| byte.is_xml_digit());
    let version = s.slice_back(number_start);

    // The closing quote must be the same character as the opening one.
    s.consume_byte(quote)?;

    Ok(version)
}
fn parse_encoding_decl(s: &mut Stream<'a>) -> StreamResult<Option<StrSpan<'a>>> {
if !s.starts_with(b"encoding") {
return Ok(None);
}
s.advance(8);
s.consume_eq()?;
let quote = s.consume_quote()?;
let name = s.consume_bytes(|_, c| {
c.is_xml_letter()
|| c.is_xml_digit()
|| c == b'.'
|| c == b'-'
|| c == b'_'
});
s.consume_byte(quote)?;
Ok(Some(name))
}
/// Parses an optional `standalone="yes|no"` pseudo-attribute.
///
/// Returns `Ok(None)` when the stream does not start with `standalone`.
/// Any value other than `yes` or `no` is an `InvalidString` error.
fn parse_standalone(s: &mut Stream<'a>) -> StreamResult<Option<bool>> {
    if !s.starts_with(b"standalone") {
        return Ok(None);
    }

    s.advance(10); // `standalone`
    s.consume_eq()?;
    let quote = s.consume_quote()?;

    let value_start = s.pos();
    let flag = match s.consume_name()?.as_str() {
        "yes" => true,
        "no" => false,
        _ => {
            let pos = s.gen_text_pos_from(value_start);
            return Err(StreamError::InvalidString("yes', 'no", pos));
        }
    };

    s.consume_byte(quote)?;

    Ok(Some(flag))
}
fn parse_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
let start = s.pos();
Self::parse_comment_impl(s)
.map_err(|e| Error::InvalidComment(e, s.gen_text_pos_from(start)))
}
/// Parses `<!-- text -->`.
///
/// Skips the first four bytes unconditionally. The text may not contain `--`
/// and may not end with `-`.
fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let comment_start = s.pos();
    s.advance(4); // `<!--`

    // Consume everything up to (not including) the first `-->`.
    let text = s.consume_chars(|stream, ch| ch != '-' || !stream.starts_with(b"-->"))?;
    s.skip_string(b"-->")?;

    let body = text.as_str();
    if body.contains("--") {
        return Err(StreamError::InvalidCommentData);
    }
    if body.ends_with('-') {
        return Err(StreamError::InvalidCommentEnd);
    }

    Ok(Token::Comment {
        text,
        span: s.slice_back(comment_start),
    })
}
/// Parses a processing instruction at the current position.
///
/// A stream-level failure is reported as `Error::InvalidPI` with the text
/// position where the instruction started.
fn parse_pi(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_pi_impl(s), s, InvalidPI)
}
/// Parses `<?target content?>`.
///
/// Skips the first two bytes unconditionally. `content` is `None` when
/// nothing but whitespace separates the target from `?>`.
fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let pi_start = s.pos();
    s.advance(2); // `<?`

    let target = s.consume_name()?;
    s.skip_spaces();

    // Consume everything up to (not including) the first `?>`.
    let raw = s.consume_chars(|stream, ch| ch != '?' || !stream.starts_with(b"?>"))?;
    let content = if raw.is_empty() { None } else { Some(raw) };

    s.skip_string(b"?>")?;

    Ok(Token::ProcessingInstruction {
        target,
        content,
        span: s.slice_back(pi_start),
    })
}
/// Parses the opening part of a DOCTYPE at the current position.
///
/// A stream-level failure is reported as `Error::InvalidDoctype` with the
/// text position where the DOCTYPE started.
fn parse_doctype(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype)
}
/// Parses `<!DOCTYPE name [external-id]` followed by `[` or `>`.
///
/// Skips the first nine bytes unconditionally. Yields `Token::DtdStart` when
/// an internal subset follows (`[`) and `Token::EmptyDtd` when the DOCTYPE
/// ends right away (`>`).
fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let doctype_start = s.pos();
    s.advance(9); // `<!DOCTYPE`

    s.consume_spaces()?;
    let name = s.consume_name()?;
    s.skip_spaces();

    let external_id = Self::parse_external_id(s)?;
    s.skip_spaces();

    let has_internal_subset = match s.curr_byte()? {
        b'[' => true,
        b'>' => false,
        other => {
            static EXPECTED: &[u8] = &[b'[', b'>'];
            return Err(StreamError::InvalidCharMultiple(other, EXPECTED, s.gen_text_pos()));
        }
    };
    s.advance(1);

    let span = s.slice_back(doctype_start);
    if has_internal_subset {
        Ok(Token::DtdStart { name, external_id, span })
    } else {
        Ok(Token::EmptyDtd { name, external_id, span })
    }
}
fn parse_external_id(s: &mut Stream<'a>) -> StreamResult<Option<ExternalId<'a>>> {
let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
let start = s.pos();
s.advance(6);
let id = s.slice_back(start);
s.consume_spaces()?;
let quote = s.consume_quote()?;
let literal1 = s.consume_bytes(|_, c| c != quote);
s.consume_byte(quote)?;
let v = if id.as_str() == "SYSTEM" {
ExternalId::System(literal1)
} else {
s.consume_spaces()?;
let quote = s.consume_quote()?;
let literal2 = s.consume_bytes(|_, c| c != quote);
s.consume_byte(quote)?;
ExternalId::Public(literal1, literal2)
};
Some(v)
} else {
None
};
Ok(v)
}
/// Parses an `<!ENTITY ...>` declaration at the current position.
///
/// A stream-level failure is reported as `Error::InvalidEntity` with the
/// text position where the declaration started.
fn parse_entity_decl(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity)
}
/// Parses `<!ENTITY [%] name definition>`.
///
/// Skips the first eight bytes unconditionally. A `%` before the name marks
/// the declaration as something other than a general entity, which affects
/// how the definition is parsed (see `parse_entity_def`).
fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let decl_start = s.pos();
    s.advance(8); // `<!ENTITY`
    s.consume_spaces()?;

    let has_percent = s.try_consume_byte(b'%');
    if has_percent {
        s.consume_spaces()?;
    }

    let name = s.consume_name()?;
    s.consume_spaces()?;

    let definition = Self::parse_entity_def(s, !has_percent)?;

    s.skip_spaces();
    s.consume_byte(b'>')?;

    Ok(Token::EntityDeclaration {
        name,
        definition,
        span: s.slice_back(decl_start),
    })
}
/// Parses the definition part of an entity declaration.
///
/// A quote starts a literal entity value; `S` or `P` must start a
/// `SYSTEM`/`PUBLIC` external identifier. When `is_ge` is true, an optional
/// `NDATA name` suffix after the external identifier is consumed and dropped.
fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult<EntityDefinition<'a>> {
    let first = s.curr_byte()?;

    match first {
        b'"' | b'\'' => {
            let quote = s.consume_quote()?;
            let value = s.consume_bytes(|_, byte| byte != quote);
            s.consume_byte(quote)?;

            Ok(EntityDefinition::EntityValue(value))
        }
        b'S' | b'P' => {
            let id = Self::parse_external_id(s)?.ok_or(StreamError::InvalidExternalID)?;

            if is_ge {
                s.skip_spaces();
                if s.starts_with(b"NDATA") {
                    s.advance(5); // `NDATA`
                    s.consume_spaces()?;
                    s.skip_name()?;
                }
            }

            Ok(EntityDefinition::ExternalId(id))
        }
        _ => {
            static EXPECTED: &[u8] = &[b'"', b'\'', b'S', b'P'];
            Err(StreamError::InvalidCharMultiple(first, EXPECTED, s.gen_text_pos()))
        }
    }
}
/// Skips a declaration up to and including the next `>`.
///
/// Fails when no `>` is found.
fn consume_decl(s: &mut Stream) -> StreamResult<()> {
    s.skip_bytes(|_, byte| byte != b'>');
    s.consume_byte(b'>')
}
/// Parses a CDATA section at the current position.
///
/// A stream-level failure is reported as `Error::InvalidCdata` with the text
/// position where the section started.
fn parse_cdata(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata)
}
/// Parses `<![CDATA[text]]>`.
///
/// Skips the first nine bytes unconditionally; `text` is everything up to
/// the first `]]>`.
fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let cdata_start = s.pos();
    s.advance(9); // `<![CDATA[`

    let text = s.consume_chars(|stream, ch| ch != ']' || !stream.starts_with(b"]]>"))?;
    s.skip_string(b"]]>")?;

    Ok(Token::Cdata {
        text,
        span: s.slice_back(cdata_start),
    })
}
/// Parses the `<name` part of a start tag at the current position.
///
/// A stream-level failure is reported as `Error::InvalidElement` with the
/// text position where the tag started.
fn parse_element_start(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement)
}
/// Parses `<` followed by a qualified name.
///
/// Skips the first byte unconditionally. Attributes and the tag end are left
/// in the stream.
fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let tag_start = s.pos();
    s.advance(1); // `<`

    let (prefix, local) = s.consume_qname()?;

    Ok(Token::ElementStart {
        prefix,
        local,
        span: s.slice_back(tag_start),
    })
}
/// Parses a closing tag at the current position.
///
/// A stream-level failure is reported as `Error::InvalidElement` with the
/// text position where the tag started.
fn parse_close_element(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement)
}
/// Parses `</name>`; whitespace is allowed between the name and `>`.
///
/// Skips the first two bytes unconditionally.
fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let tag_start = s.pos();
    s.advance(2); // `</`

    let (prefix, local) = s.consume_qname()?;
    s.skip_spaces();
    s.consume_byte(b'>')?;

    Ok(Token::ElementEnd {
        end: ElementEnd::Close(prefix, local),
        span: s.slice_back(tag_start),
    })
}
/// Parses the next item inside a start tag: an attribute or the tag end.
///
/// After optional whitespace, `/>` yields `ElementEnd::Empty` and `>` yields
/// `ElementEnd::Open`. Anything else must be an attribute and must have been
/// preceded by whitespace. The attribute value stops at the closing quote and
/// may not contain `<`.
fn parse_attribute(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let attr_start = s.pos();
    let has_space = s.starts_with_space();
    s.skip_spaces();

    let item_start = s.pos();
    match s.curr_byte() {
        Ok(b'/') => {
            s.advance(1);
            s.consume_byte(b'>')?;
            return Ok(Token::ElementEnd {
                end: ElementEnd::Empty,
                span: s.slice_back(item_start),
            });
        }
        Ok(b'>') => {
            s.advance(1);
            return Ok(Token::ElementEnd {
                end: ElementEnd::Open,
                span: s.slice_back(item_start),
            });
        }
        _ => {}
    }

    // An attribute has to be separated from whatever precedes it.
    if !has_space {
        return Err(if s.at_end() {
            StreamError::UnexpectedEndOfStream
        } else {
            StreamError::InvalidSpace(s.curr_byte_unchecked(), s.gen_text_pos_from(attr_start))
        });
    }

    let name_start = s.pos();
    let (prefix, local) = s.consume_qname()?;
    s.consume_eq()?;

    let quote = s.consume_quote()?;
    let quote_char = char::from(quote);
    // Stop at `<` as well: the closing-quote check below then fails on it.
    let value = s.consume_chars(|_, ch| !(ch == quote_char || ch == '<'))?;
    s.consume_byte(quote)?;

    Ok(Token::Attribute {
        prefix,
        local,
        value,
        span: s.slice_back(name_start),
    })
}
/// Parses character data at the current position.
///
/// A stream-level failure is reported as `Error::InvalidCharData` with the
/// text position where the text started.
fn parse_text(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_text_impl(s), s, InvalidCharData)
}
/// Consumes character data up to the next `<`.
///
/// Text containing `]]>` is rejected with `InvalidCharacterData`.
fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
    let text = s.consume_chars(|_, ch| ch != '<')?;

    // Look for the single `>` first; only then search for the full `]]>`.
    let raw = text.as_str();
    if raw.contains('>') && raw.contains("]]>") {
        return Err(StreamError::InvalidCharacterData);
    }

    Ok(Token::Text { text })
}
}
impl<'a> Iterator for Tokenizer<'a> {
    type Item = Result<Token<'a>>;

    /// Returns the next token, or `None` when the input is exhausted.
    ///
    /// The first error is returned once; the tokenizer then jumps to the end
    /// of the stream and every later call returns `None`.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let exhausted = self.stream.at_end() || self.state == State::End;
        if exhausted {
            return None;
        }

        let token = self.parse_next_impl();

        match token {
            Some(Err(_)) => {
                self.stream.jump_to_end();
                self.state = State::End;
            }
            _ => {}
        }

        token
    }
}