/************************************************************************ The zlib/libpng License Copyright (c) 2006 Joerg Wiedenmann This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. ***********************************************************************/ /******************************************************************** created: 2006-01-28 filename: tokenizer.cpp author: Jörg Wiedenmann purpose: A tokenizer function which provides a very customizable way of breaking up strings. history: 2006-01-28, Original version 2006-03-04, Fixed a small parsing bug, thanks Elias. *********************************************************************/ #include "tokenizer.h" using namespace std; void tokenize ( const string& str, vector& result, const string& delimiters, const string& delimiters_preserve, const string& quote, const string& esc ) { // clear the vector if ( false == result.empty() ) { result.clear(); } string::size_type pos = 0; // the current position (char) in the string char ch = 0; // buffer for the current character char delimiter = 0; // the buffer for the delimiter char which // will be added to the tokens if the delimiter // is preserved char current_quote = 0; // the char of the current open quote bool quoted = false; // indicator if there is an open quote string token; // string buffer for the token bool token_complete = false; // indicates if the current token is // read to be added to the result vector string::size_type len = str.length(); // length of the input-string // for every char in the input-string while ( len > pos ) { // get the character of the string and reset the delimiter buffer ch = str.at(pos); delimiter = 0; // assume ch isn't a delimiter bool add_char = true; // check ... // ... if the delimiter is an escaped character bool escaped = false; // indicates if the next char is protected if ( false == esc.empty() ) // check if esc-chars are provided { if ( string::npos != esc.find_first_of(ch) ) { // get the escaped char ++pos; if ( pos < len ) // if there are more chars left { // get the next one ch = str.at(pos); // add the escaped character to the token add_char = true; } else // cannot get any more characters { // don't add the esc-char add_char = false; } // ignore the remaining delimiter checks escaped = true; } } // ... if the delimiter is a quote if ( false == quote.empty() && false == escaped ) { // if quote chars are provided and the char isn't protected if ( string::npos != quote.find_first_of(ch) ) { // if not quoted, set state to open quote and set // the quote character if ( false == quoted ) { quoted = true; current_quote = ch; // don't add the quote-char to the token add_char = false; } else // if quote is open already { // check if it is the matching character to close it if ( current_quote == ch ) { // close quote and reset the quote character quoted = false; current_quote = 0; // don't add the quote-char to the token add_char = false; } } // else } } // ... if the delimiter isn't preserved if ( false == delimiters.empty() && false == escaped && false == quoted ) { // if a delimiter is provided and the char isn't protected by // quote or escape char if ( string::npos != delimiters.find_first_of(ch) ) { // if ch is a delimiter and the token string isn't empty // the token is complete if ( false == token.empty() ) // BUGFIX: 2006-03-04 { token_complete = true; } // don't add the delimiter to the token add_char = false; } } // ... if the delimiter is preserved - add it as a token bool add_delimiter = false; if ( false == delimiters_preserve.empty() && false == escaped && false == quoted ) { // if a delimiter which will be preserved is provided and the // char isn't protected by quote or escape char if ( string::npos != delimiters_preserve.find_first_of(ch) ) { // if ch is a delimiter and the token string isn't empty // the token is complete if ( false == token.empty() ) // BUGFIX: 2006-03-04 { token_complete = true; } // don't add the delimiter to the token add_char = false; // add the delimiter delimiter = ch; add_delimiter = true; } } // add the character to the token if ( true == add_char ) { // add the current char token.push_back( ch ); } // add the token if it is complete if ( true == token_complete && false == token.empty() ) { // add the token string result.push_back( token ); // clear the contents token.clear(); // build the next token token_complete = false; } // add the delimiter if ( true == add_delimiter ) { // the next token is the delimiter string delim_token; delim_token.push_back( delimiter ); result.push_back( delim_token ); // REMOVED: 2006-03-04, Bugfix } // repeat for the next character ++pos; } // while // add the final token if ( false == token.empty() ) { result.push_back( token ); } }