You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

223 lines
5.9 KiB
C++

/************************************************************************
The zlib/libpng License
Copyright (c) 2006 Joerg Wiedenmann
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
***********************************************************************/
/********************************************************************
created: 2006-01-28
filename: tokenizer.cpp
author: Jörg Wiedenmann
purpose: A tokenizer function which provides a very
customizable way of breaking up strings.
history: 2006-01-28, Original version
2006-03-04, Fixed a small parsing bug, thanks Elias.
*********************************************************************/
#include "tokenizer.h"
using namespace std;
// Splits `str` into tokens and stores them, in order, in `result`.
//
// str                  the text to break up
// result               receives the tokens; whatever it held before is discarded
// delimiters           characters that end a token and are thrown away
// delimiters_preserve  characters that end a token and are themselves emitted
//                      as a one-character token
// quote                characters that open a quoted section; the section is
//                      closed by the same character that opened it. Inside a
//                      quoted section delimiters lose their meaning. The
//                      opening/closing characters are not part of the token,
//                      but a *different* quote character met inside an open
//                      section is kept as ordinary text.
// esc                  characters that make the following character literal
//                      (also inside quoted sections); the escape character
//                      itself is dropped, and so is an escape character that
//                      is the very last character of `str`.
//
// Empty tokens are never produced: runs of discarded delimiters collapse and
// an empty quoted section contributes nothing.
void tokenize ( const string& str, vector<string>& result,
const string& delimiters, const string& delimiters_preserve,
const string& quote, const string& esc )
{
    // start from an empty result
    result.clear();

    const string::size_type length = str.length();

    string current;          // the token that is being assembled
    bool   in_quote   = false; // true while a quoted section is open
    char   open_quote = 0;     // the character that opened that section

    for ( string::size_type i = 0; i < length; ++i )
    {
        char c    = str[i];
        bool keep = true;    // does c become part of the current token?

        // --- escape handling -------------------------------------------
        // An escape character is replaced by the character following it,
        // which is then exempt from every check below.
        const bool is_escaped = ( esc.find( c ) != string::npos );
        if ( is_escaped )
        {
            ++i;
            if ( i < length )
            {
                c = str[i];
            }
            else
            {
                // nothing follows the escape character: drop it
                keep = false;
            }
        }

        // --- quote handling --------------------------------------------
        if ( !is_escaped && quote.find( c ) != string::npos )
        {
            if ( !in_quote )
            {
                // opening quote: remember which character has to close it
                in_quote   = true;
                open_quote = c;
                keep       = false;
            }
            else if ( c == open_quote )
            {
                // matching closing quote
                in_quote   = false;
                open_quote = 0;
                keep       = false;
            }
            // any other quote character inside an open section is plain text
        }

        // --- delimiter handling ----------------------------------------
        // Evaluated after the quote state was updated, so the character
        // that opens a quoted section is already protected here, while the
        // one that closes it is not.
        const bool unprotected = !is_escaped && !in_quote;
        const bool is_dropped  =
            unprotected && delimiters.find( c ) != string::npos;
        const bool is_kept     =
            unprotected && delimiters_preserve.find( c ) != string::npos;

        if ( is_dropped || is_kept )
        {
            // a delimiter never joins the token; it finishes the token
            // collected so far, provided there is one
            if ( !current.empty() )
            {
                result.push_back( current );
                current.clear();
            }
        }
        else if ( keep )
        {
            current.push_back( c );
        }

        // a preserved delimiter follows as a token of its own
        if ( is_kept )
        {
            result.push_back( string( 1, c ) );
        }
    }

    // whatever is left after the last character is the final token
    if ( !current.empty() )
    {
        result.push_back( current );
    }
}