Source code for pockets.string

# -*- coding: utf-8 -*-
# Copyright (c) 2018 the Pockets team, see AUTHORS.
# Licensed under the BSD License, see LICENSE for details.

"""A pocket full of useful string manipulation tools!"""

from __future__ import absolute_import, print_function

import re

import six

from pockets.collections import is_listy, listify


__all__ = [
    "camel",
    "uncamel",
    "fieldify",
    "unfieldify",
    "sluggify",
    "splitcaps",
    "splitify",
    "UnicodeMixin",
]


# Default regular expression flags, shared by the patterns compiled below
# and used by `splitcaps` when the caller does not supply its own flags.
# Python 2 additionally enables re.L (LOCALE); every other interpreter gets
# MULTILINE and UNICODE only.
# NOTE(review): presumably LOCALE is omitted on Python 3 because it is not
# accepted for str patterns there - confirm against the `re` documentation.
if six.PY2:
    RE_FLAGS = re.L | re.M | re.U
else:
    RE_FLAGS = re.M | re.U

# One or more consecutive non-word characters or underscores. Used as the
# word delimiter by `fieldify` and `sluggify`. Note that this pattern is
# compiled without RE_FLAGS.
RE_NONWORD = re.compile(r"[\W_]+")

# Tokenizer used by `splitcaps`: every successive match is one chunk of the
# input string. The three alternatives are tried in order at each position.
RE_SPLITCAPS = re.compile(
    # Clause 1: an acronym-like run, e.g. "HTTP" or "HTTP/1.1"
    r"[A-Z]+[^a-z]*"  # All non-lowercase beginning with a capital letter
    r"(?=[A-Z][^A-Z]*?[a-z]|$)"  # Lookahead: a capitalized word or end of line
    r"|"
    # Clause 2: a capital letter, eventually followed by lowercase letters,
    # extended up to (but not including) the next capital letter
    r"[A-Z][^A-Z]*?[a-z]+[^A-Z]*" r"|"  # Capitalized word
    # Clause 3: anything else that contains no capital letter at all
    r"[^A-Z]+",  # All non-uppercase
    RE_FLAGS,
)

# Matches each CamelCase word that is preceded by a non-whitespace
# character, i.e. every word boundary inside a whitespace-delimited token
# except the token's own start. `uncamel` inserts its separator in front of
# each match (captured as group 1).
RE_UNCAMEL = re.compile(
    r"("  # The whole expression is in a single group
    # Clause 1: an acronym-like run of capitals inside a token
    r"(?<=[^\sA-Z])"  # Preceded by neither a space nor a capital letter
    r"[A-Z]+[^a-z\s]*"  # All non-lowercase beginning with a capital letter
    r"(?=[A-Z][^A-Z\s]*?[a-z]|\s|$)"  # Lookahead: capitalized word, space or end of line
    r"|"
    # Clause 2: an ordinary capitalized word inside a token
    r"(?<=[^\s])"  # Preceded by a character that is not a space
    r"[A-Z][^A-Z\s]*?[a-z]+[^A-Z\s]*"  # Capitalized word
    r")",
    RE_FLAGS,
)

# One or more whitespace characters, captured in a group so that
# `RE_WHITESPACE_GROUP.split(...)` keeps the whitespace runs in its result.
# `camel` relies on this to re-join the pieces with the spacing unchanged.
RE_WHITESPACE_GROUP = re.compile(r"(\s+)", RE_FLAGS)


def camel(
    s, sep="_", lower_initial=False, upper_segments=None, preserve_upper=False
):
    """
    Convert underscore_separated string (aka snake_case) to CamelCase.

    The input is first cut into whitespace-delimited words (the whitespace
    itself is kept untouched), so full sentences work as well as individual
    words:

    >>> camel("hello_world!")
    'HelloWorld!'
    >>> camel("Totally works as_expected, even_with_whitespace!")
    'Totally Works AsExpected, EvenWithWhitespace!'

    Args:
        s (str): The string to convert.

        sep (string, optional): Delineates segments of `s` that will be
            CamelCased. Defaults to an underscore "_".

            For example, if you want to CamelCase a dash separated word:

            >>> camel("xml-http-request", sep="-")
            'XmlHttpRequest'

        lower_initial (bool, int, or list, optional): If True, the initial
            character of each camelCased word will be lowercase. If False,
            the initial character of each CamelCased word will be
            uppercase. Defaults to False:

            >>> camel("http_request http_response")
            'HttpRequest HttpResponse'
            >>> camel("http_request http_response", lower_initial=True)
            'httpRequest httpResponse'

            Optionally, `lower_initial` can be an int or a list of ints,
            indicating which individual segments of each CamelCased word
            should start with a lowercase. Supports negative numbers to
            index segments from the right:

            >>> camel("xml_http_request", lower_initial=0)
            'xmlHttpRequest'
            >>> camel("xml_http_request", lower_initial=-1)
            'XmlHttprequest'
            >>> camel("xml_http_request", lower_initial=[0, 1])
            'xmlhttpRequest'

        upper_segments (int or list, optional): Indicates which segments of
            CamelCased words should be fully uppercased, instead of just
            capitalizing the first letter.

            Can be an int, indicating a single segment, or a list of ints,
            indicating multiple segments. Supports negative numbers to
            index segments from the right.

            `upper_segments` is helpful when dealing with acronyms:

            >>> camel("tcp_socket_id", upper_segments=0)
            'TCPSocketId'
            >>> camel("tcp_socket_id", upper_segments=[0, -1])
            'TCPSocketID'
            >>> camel("tcp_socket_id", upper_segments=[0, -1], lower_initial=1)
            'TCPsocketID'

        preserve_upper (bool): If True, existing uppercase characters will
            not be automatically lowercased. Defaults to False.

            >>> camel("xml_HTTP_reQuest")
            'XmlHttpRequest'
            >>> camel("xml_HTTP_reQuest", preserve_upper=True)
            'XmlHTTPReQuest'

    Returns:
        str: CamelCased version of `s`.

    """
    # Normalize both selectors into collections of segment indexes.
    if isinstance(lower_initial, bool):
        lower_indexes = [0] if lower_initial else []
    else:
        lower_indexes = listify(lower_initial)
    upper_indexes = listify(upper_segments)

    pieces = []
    # Splitting on a capturing group keeps the whitespace runs as tokens, so
    # joining the pieces afterwards restores the original spacing.
    for token in RE_WHITESPACE_GROUP.split(s):
        parts = [part for part in token.split(sep) if part]
        total = len(parts)
        for index, part in enumerate(parts):
            # `index - total` is the same position counted from the right,
            # which is how negative indexes are supported.
            from_right = index - total
            make_upper = index in upper_indexes or from_right in upper_indexes
            make_lower = index in lower_indexes or from_right in lower_indexes

            if make_upper:
                if not make_lower:
                    part = part.upper()
                elif preserve_upper:
                    part = part[0] + part[1:].upper()
                else:
                    part = part[0].lower() + part[1:].upper()
            elif make_lower:
                # With preserve_upper the segment is left exactly as given.
                if not preserve_upper:
                    part = part.lower()
            else:
                if preserve_upper:
                    part = part[0].upper() + part[1:]
                else:
                    part = part[0].upper() + part[1:].lower()

            pieces.append(part)

    return "".join(pieces)
def uncamel(s, sep="_"):
    """
    Convert CamelCase string to underscore_separated (aka snake_case).

    A CamelCase word is considered to be any uppercase letter followed by zero
    or more lowercase letters. Contiguous groups of uppercase letters – like
    you would find in an acronym – are also considered part of a single word:

    >>> uncamel("Request")
    'request'
    >>> uncamel("HTTP")
    'http'
    >>> uncamel("HTTPRequest")
    'http_request'
    >>> uncamel("xmlHTTPRequest")
    'xml_http_request'

    Works on full sentences as well as individual words:

    >>> uncamel("HelloWorld!")
    'hello_world!'
    >>> uncamel("Totally works AsExpected, EvenWithWhitespace!")
    'totally works as_expected, even_with_whitespace!'

    Args:
        s (str): The CamelCase string to convert.

        sep (str, optional): String used to separate CamelCase words.
            Defaults to an underscore "_". It is inserted literally, so it
            may safely contain backslashes or other characters that are
            special in regular expression replacement templates.

            For example, if you want dash separated words:

            >>> uncamel("XmlHttpRequest", sep="-")
            'xml-http-request'

    Returns:
        str: uncamel_cased version of `s`. The whole result, including any
        inserted separators, is lowercased.

    """

    def _prefix_with_sep(match):
        # A callable replacement is used instead of a template string so
        # that `sep` is never parsed for escapes or group references.
        return sep + match.group(1)

    return RE_UNCAMEL.sub(_prefix_with_sep, s).lower()
def fieldify(s, sep="_"):
    """
    Convert a string into a valid "field-like" variable name.

    Converts `s` from camel case to underscores, and replaces all spaces and
    non-word characters with `sep`:

    >>> fieldify('The XmlHTTPRequest Contained, "DATA..."')
    'the_xml_http_request_contained_data'

    Args:
        s (str): The string to fieldify.
        sep (str): The string to use as a word separator in the returned
            field. Defaults to '_'. It is inserted literally and never
            appears at the start or end of the result.

    Returns:
        str: The field version of `s`, or an empty string if `s` is falsy.

    """
    if not s:
        return ""
    # Split on delimiter runs and re-join with `sep`, rather than
    # substituting and then calling `.strip(sep)`. This keeps `sep` out of
    # the regex replacement template (where backslashes are special) and
    # avoids `str.strip` treating a multi-character `sep` as a character
    # set, which could eat real leading or trailing letters.
    words = [word for word in RE_NONWORD.split(uncamel(s)) if word]
    return sep.join(words)
def unfieldify(s, sep="_"):
    """
    Makes a best effort to reverse the algorithm from `fieldify`.

    Replaces instances of `sep` in `s` with a space and converts the result to
    title case:

    >>> unfieldify('the_xml_http_request_contained_data')
    'The Xml Http Request Contained Data'

    Args:
        s (str): The string to unfieldify.
        sep (str): The string to consider a word separator in `s`.
            Defaults to '_'.

    Returns:
        str: The unfieldified version of `s`, or an empty string if `s` is
        falsy.

    """
    if not s:
        return ""
    # Trim separator characters and spaces from both ends before splitting.
    trimmed = s.strip("{0} ".format(sep))
    # Empty pieces come from consecutive separators and are dropped.
    words = (word for word in trimmed.split(sep) if word)
    return " ".join(words).title()
def sluggify(s, sep="-"):
    """
    Convert a string into a "slug" suitable for use in a URL.

    Converts `s` to lower case, and replaces all spaces and non-word
    characters with `sep`:

    >>> sluggify('The ANGRY Wizard Shouted, "HEY..."')
    'the-angry-wizard-shouted-hey'

    Args:
        s (str): The string to convert into a slug.
        sep (str): The string to use as a word separator in the slug.
            Defaults to '-'. It is inserted literally and never appears at
            the start or end of the result.

    Returns:
        str: The sluggify version of `s`, or an empty string if `s` is
        falsy. The whole result, including separators, is lowercased.

    """
    if not s:
        return ""
    # Split on delimiter runs and re-join with `sep`, rather than
    # substituting and then calling `.strip(sep)`. This keeps `sep` out of
    # the regex replacement template (where backslashes are special) and
    # avoids `str.strip` treating a multi-character `sep` as a character
    # set, which could eat real leading or trailing letters.
    words = [word for word in RE_NONWORD.split(s) if word]
    return sep.join(words).lower()
def splitcaps(s, pattern=None, maxsplit=None, flags=0):
    """
    Intelligently split a string on capitalized words.

    A capitalized word is considered to be any uppercase letter followed by
    zero or more lowercase letters. Contiguous groups of uppercase letters –
    like you would find in an acronym – are also considered part of a single
    word:

    >>> splitcaps("Request")
    ['Request']
    >>> splitcaps("HTTP")
    ['HTTP']
    >>> splitcaps("HTTPRequest")
    ['HTTP', 'Request']
    >>> splitcaps("HTTP/1.1Request")
    ['HTTP/1.1', 'Request']
    >>> splitcaps("xmlHTTPRequest")
    ['xml', 'HTTP', 'Request']

    If no capitalized words are found in `s`, the whole string is returned in
    a single element list:

    >>> splitcaps("")
    ['']
    >>> splitcaps("lower case words")
    ['lower case words']

    Does not split on whitespace by default. To also split on whitespace, pass
    "\\\\s+" for `pattern`:

    >>> splitcaps("Without whiteSpace pattern")
    ['Without white', 'Space pattern']
    >>> splitcaps("With whiteSpace pattern", pattern=r"\\s+")
    ['With', 'white', 'Space', 'pattern']
    >>> splitcaps("With whiteSpace group", pattern=r"(\\s+)")
    ['With', ' ', 'white', 'Space', ' ', 'group']

    Args:
        s (str): The string to split.

        pattern (str, optional): In addition to splitting on capital
            letters, also split by the occurrences of `pattern`. If
            capturing parentheses are used in `pattern`, then the text of
            all groups in `pattern` are also returned as part of the
            resulting list. Defaults to None.

        maxsplit (int, optional): If maxsplit is not specified or -1, then
            there is no limit on the number of splits (all possible splits
            are made). If maxsplit is >= 0, at most maxsplit splits occur,
            and the remainder of the string is returned as the final
            element of the list.

        flags (int, optional): Flags to pass to the regular expression
            created using `pattern`. Ignored if `pattern` is not specified.
            When 0 or omitted, the module-level `RE_FLAGS` are used.

    Returns:
        list: List of capitalized substrings in `s`.

    """
    if not maxsplit:
        if maxsplit == 0:
            # Zero splits requested: hand the input back untouched.
            return [s]
        maxsplit = -1

    splitter = re.compile(pattern, flags or RE_FLAGS) if pattern else None

    chunks = []
    overflow = []

    def limit_reached():
        # Evaluated lazily, only once a match has actually been found.
        return maxsplit > 0 and len(chunks) >= maxsplit

    for match in RE_SPLITCAPS.finditer(s):
        token = match.group()
        if splitter is None:
            chunks.append(token)
        else:
            for piece in splitter.split(token):
                if not piece:
                    continue
                # Pieces found after the limit is hit are folded into the
                # final "remainder" element instead of being emitted.
                if limit_reached():
                    overflow.append(piece)
                else:
                    chunks.append(piece)

        if limit_reached():
            if match.end() < len(s):
                overflow.append(s[match.end():])
            remainder = "".join(overflow)
            if remainder:
                chunks.append(remainder)
            break

    return chunks if chunks else [s]
def splitify(value, separator=",", strip=True, include_empty=False):
    """
    Convert a value to a list using a supercharged `split()`.

    If `value` is a string, it is split by `separator`. If `separator` is
    `None` or empty, no attempt to split is made, and `value` is returned as
    the only item in a list.

    If `strip` is `True`, then the split strings will be stripped of
    whitespace. If `strip` is a string, then the split strings will be
    stripped of the given string.

    If `include_empty` is `False`, then empty split strings will not be
    included in the returned list.

    If `value` is `None` an empty list is returned.

    If `value` is already "listy", it is returned as-is.

    If `value` is any other type, it is returned as the only item in a list.

    >>> splitify("first item, second item")
    ['first item', 'second item']
    >>> splitify("first path: second path: :skipped empty path", ":")
    ['first path', 'second path', 'skipped empty path']
    >>> splitify(["already", "split"])
    ['already', 'split']
    >>> splitify(None)
    []
    >>> splitify(1969)
    [1969]

    Args:
        value: The value to convert to a list.
        separator (str, optional): The delimiter to split string values on.
            Defaults to ",".
        strip (bool or str, optional): Whether, or with which characters, to
            strip each split string. Defaults to True.
        include_empty (bool, optional): Whether to keep empty split strings.
            Defaults to False.

    Returns:
        list: The split strings, `value` itself if it is already listy, or
        the result of `listify(value)` otherwise.

    """
    if is_listy(value):
        return value
    # six.string_types also covers `unicode` on Python 2, so text strings
    # are split there too instead of falling through to `listify`. On
    # Python 3 it is simply `(str,)`.
    if isinstance(value, six.string_types) and separator:
        parts = value.split(separator)
        if strip:
            # `str.strip(None)` strips whitespace; a string strips those
            # characters instead.
            chars = None if strip is True else strip
            parts = [part.strip(chars) for part in parts]
        return [part for part in parts if include_empty or part]
    return listify(value)
class UnicodeMixin(object):
    """
    Mixin class to define proper __str__/__unicode__ methods in Python 2 or 3.

    Subclasses implement `__unicode__`; this mixin derives `__str__` from it.
    On Python 2 the text is encoded as UTF-8, otherwise it is returned as-is.

    Originally found on the `Porting Python 2 Code to Python 3 HOWTO`_.

    .. _Porting Python 2 Code to Python 3 HOWTO:
       https://docs.python.org/3.3/howto/pyporting.html

    """

    if not six.PY2:

        def __str__(self):
            return self.__unicode__()

    else:

        def __str__(self):
            return self.__unicode__().encode("utf8")