# strtools.py
#
# Copyright 2003 Wichert Akkerman <wichert@deephackmode.org>
#
# This file is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# String tokenizing, regular expression filtering and codec file wrapping

"""String handling tools

Utility functions for some standard string processing tasks
"""

# Declares the markup used by the docstrings in this module: epytext
# (the @param/@type/@return fields below), written in English.
__docformat__	= "epytext en"

import codecs, re

class TokenizeError(Exception):
	"""Error raised by L{Tokenize} when its input ends unexpectedly."""


def Tokenize(str, whitespace=" \t\r\n", quotes="\"", escapes="\\"):
	r"""String tokenizer

	Split a string into tokens separated by runs of whitespace, while
	taking quotation and escaping into account. Whitespace inside quoted
	text is kept as part of the token, the quote characters themselves
	are dropped, and the character following an escape character is
	always taken literally. Leading and trailing whitespace is ignored.
	An empty quoted string does not produce a token of its own.

	  >>> import dhm.strtools
	  >>> dhm.strtools.Tokenize("this is a test")
	  ['this', 'is', 'a', 'test']
	  >>> dhm.strtools.Tokenize("this \"is a\" test")
	  ['this', 'is a', 'test']
	  >>> dhm.strtools.Tokenize("this \\\"is\\\" a test")
	  ['this', '"is"', 'a', 'test']
	  >>> dhm.strtools.Tokenize("  leading and trailing  ")
	  ['leading', 'and', 'trailing']
	  >>> dhm.strtools.Tokenize("this \"is a test")
	  Traceback (most recent call last):
	    ...
	  dhm.strtools.TokenizeError: Unexpected end of string in quoted text

	@param        str: string to tokenize
	@type         str: string
	@param whitespace: whitespace characters separating tokens
	@type  whitespace: string
	@param     quotes: legal quoting characters
	@type      quotes: string
	@param    escapes: characters which can escape quoting characters
	@type     escapes: string
	@return: list of tokens
	@rtype:  sequence of strings
	@raise TokenizeError: if the string ends inside quoted text or
	                      directly after an escape character
	"""
	tokens=[]
	curtoken=None	# None means no token is currently being collected
	quote=None	# the quote character we are inside of, if any
	pos=0
	end=len(str)

	# Walk the input by index; re-slicing the remaining string for every
	# character would copy it over and over again.
	while pos<end:
		char=str[pos]
		if char==quote:
			# Closing quote: it has to match the one that opened the text
			quote=None
		elif quote is None and char in quotes:
			quote=char
		elif char in whitespace:
			if quote is not None:
				# Quoted whitespace is ordinary token data
				curtoken=(curtoken or "")+char
			elif curtoken is not None:
				# End of a token. When no token is in progress (leading
				# whitespace or a run of separators) there is nothing
				# to emit.
				tokens.append(curtoken)
				curtoken=None
		elif char in escapes:
			# Take the next character literally, whatever it is
			pos+=1
			if pos>=end:
				raise TokenizeError("Unexpected end of string")
			curtoken=(curtoken or "")+str[pos]
		else:
			curtoken=(curtoken or "")+char

		pos+=1

	if quote is not None:
		raise TokenizeError("Unexpected end of string in quoted text")

	if curtoken is not None:
		tokens.append(curtoken)

	return tokens
	

def RegexFilter(regexp, *args):
	"""Collect the regexp groups of every matching string.

	Each argument string is searched for the regular expression. Strings
	in which the expression is not found are skipped; for all others
	the tuple of matched groups is added to the result, in argument
	order.

	  >>> import strtools
	  >>> strtools.RegexFilter("([^=]*)=(.*)", "username=wichert", "# a comment", "password=secret")
	  [('username', 'wichert'), ('password', 'secret')]

	@param regexp: regular expression to look for
	@type  regexp: string
	@param  *args: strings to filter
	@type   *args: string argument list
	@return: groups of every string in which the expression was found
	@rtype:  list of tuples of matched strings
	"""

	pattern=re.compile(regexp)
	found=[pattern.search(text) for text in args]
	return [hit.groups() for hit in found if hit]


def CodecFile(fo, encoding="utf-8"):
	"""Return a new file object for a special codec.

	This function wraps a file object in a StreamReaderWriter of
	a specific encoding. This is especially useful if you want to read
	data in a different encoding than the default ASCII. Decoding and
	encoding errors are handled in "strict" mode, and the name of the
	encoding is stored in the C{encoding} attribute of the returned
	object.

	@param       fo: file to wrap
	@type        fo: file object instance
	@param encoding: name of the encoding to use
	@type  encoding: string
	@return: file object with proper encoding
	@rtype: codecs.StreamReaderWriter instance
	@raise LookupError: if no codec is registered for the encoding
	"""

	# Only the stream classes of the codec are needed, so fetch them
	# directly instead of unpacking the whole codecs.lookup() result.
	reader=codecs.getreader(encoding)
	writer=codecs.getwriter(encoding)
	srw=codecs.StreamReaderWriter(fo, reader, writer, "strict")
	# Lets callers find out which encoding the wrapper was created with
	srw.encoding=encoding
	return srw