corso/src/pkg/path/path.go

// Package path provides a set of functions for wrangling paths from the outside
// world into paths that corso can understand. Paths use the standard Unix path
// separator character '/'. If for some reason an individual element in a raw
// path contains the '/' character, it should be escaped with '\'. If the path
// contains '\' it should be escaped by turning it into '\\'.
//
// Paths can be split into elements by splitting on '/' if the '/' is not
// escaped. Additionally, corso may operate on segments in a path. Segments are
// made up of one or more path elements.
//
// Examples of paths splitting by elements and canonicalization with escaping:
// 1.
//
//	input path: `this/is/a/path`
//	elements of path: `this`, `is`, `a`, `path`
//
// 2.
//
//	input path: `this/is\/a/path`
//	elements of path: `this`, `is/a`, `path`
//
// 3.
//
//	input path: `this/is\\/a/path`
//	elements of path: `this`, `is\`, `a`, `path`
//
// 4.
//
//	input path: `this/is\\\/a/path`
//	elements of path: `this`, `is\/a`, `path`
//
// 5.
//
//	input path: `this/is//a/path`
//	elements of path: `this`, `is`, `a`, `path`
//
// 6.
//
//	input path: `this/is\//a/path`
//	elements of path: `this`, `is/`, `a`, `path`
//
// 7.
//
//	input path: `this/is/a/path/`
//	elements of path: `this`, `is`, `a`, `path`
//
// 8.
//
//	input path: `this/is/a/path\/`
//	elements of path: `this`, `is`, `a`, `path/`
package path

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"strings"

	"github.com/alcionai/clues"
	"github.com/pkg/errors"
)

const (
	// escapeCharacter is placed in front of a character that would otherwise
	// be interpreted as path syntax (the separator or the escape character
	// itself).
	escapeCharacter = '\\'
	// PathSeparator is the character that delimits elements in an escaped
	// path string.
	PathSeparator   = '/'

	// shortRefCharacters is the number of hex characters produced by
	// Builder.ShortRef.
	shortRefCharacters = 12
)

// charactersToEscape is the set of runes that must be preceded by
// escapeCharacter when they appear inside an escaped path element. It is
// consulted both when escaping elements and when validating escaped input.
var charactersToEscape = map[rune]struct{}{
	PathSeparator:   {},
	escapeCharacter: {},
}

var (
	// errMissingSegment is stacked into the error returned when a required
	// prefix value (tenant or resource owner) is empty.
	errMissingSegment = errors.New("missing required path element")
	// errParsingPath is stacked into errors produced by FromDataLayerPath when
	// the given string cannot be turned into a resource path.
	errParsingPath    = errors.New("parsing resource path")
)

// Path describes a resource path and the operations available on it.
//
// For now, adding generic functions to pull information from segments.
// Resources that don't have the requested information should return an empty
// string.
type Path interface {
	String() string
	Service() ServiceType
	Category() CategoryType
	Tenant() string
	ResourceOwner() string
	// NOTE(review): the meaning of the bool argument is not visible in this
	// file; confirm against the implementation before relying on it.
	Folder(bool) string
	Folders() []string
	Item() string
	// PopFront returns a Builder object with the first element (left-side)
	// removed. As the resulting set of elements is no longer a valid resource
	// path a Builder is returned instead.
	PopFront() *Builder
	// Dir returns a Path object with the right-most element removed if possible.
	// If removing the right-most element would discard one of the required prefix
	// elements then an error is returned.
	Dir() (Path, error)
	// Elements returns all the elements in the path. This is a temporary function
	// and will likely be updated to handle encoded elements instead of clear-text
	// elements in the future.
	Elements() []string
	// Append returns a new Path object with the given element added to the end of
	// the old Path if possible. If the old Path is an item Path then Append
	// returns an error.
	Append(element string, isItem bool) (Path, error)
	// ShortRef returns a short reference representing this path. The short
	// reference is guaranteed to be unique. No guarantees are made about whether
	// a short reference can be converted back into the Path that generated it.
	//
	// NOTE(review): Builder.ShortRef derives its value from a truncated SHA-256
	// digest, so for that implementation uniqueness is probabilistic rather
	// than guaranteed; confirm whether the stronger wording is intended.
	ShortRef() string
	// ToBuilder returns a Builder instance that represents the current Path.
	ToBuilder() *Builder
}

// Builder is a simple path representation that only tracks path elements. It
// can join, escape, and unescape elements. Higher-level packages are expected
// to wrap this struct to build resource-specific contexts (e.g. an
// ExchangeMailPath).
// Resource-specific paths allow access to more information like segments in the
// path. Builders that are turned into resource paths later on do not need to
// manually add prefixes for items that normally appear in the data layer (e.g.
// tenant ID, service, user ID, etc).
type Builder struct {
	// Unescaped version of elements.
	elements []string
}

// UnescapeAndAppend creates a copy of this Builder and adds one or more already
// escaped path elements to the end of the new Builder. Elements are added in
// the order they are passed. The receiver is left unmodified.
//
// Every given element is validated and unescaped by appendElements. If any
// element is malformed the error is returned and no Builder is produced.
func (pb Builder) UnescapeAndAppend(elements ...string) (*Builder, error) {
	// copy transfers min(len(dst), len(src)) items, so the destination needs
	// the full length of the existing elements (not just the capacity) or the
	// existing elements would be dropped. Extra capacity is reserved for the
	// elements that are about to be appended.
	res := &Builder{
		elements: make([]string, len(pb.elements), len(pb.elements)+len(elements)),
	}
	copy(res.elements, pb.elements)

	if err := res.appendElements(true, elements); err != nil {
		return nil, err
	}

	return res, nil
}

// SplitUnescapeAppend parses s as an escaped directory path and returns a new
// Builder holding this Builder's contents followed by the elements of s. Any
// unescaped trailing separators on s are dropped before it is split. Errors
// from unescaping the individual elements are returned to the caller.
func (pb Builder) SplitUnescapeAppend(s string) (*Builder, error) {
	trimmed := TrimTrailingSlash(s)

	return pb.UnescapeAndAppend(Split(trimmed)...)
}

// Append creates a copy of this Builder and adds the given elements to the end
// of the new Builder. Elements are added in the order they are passed and are
// treated as raw (unescaped) values. The receiver is left unmodified.
func (pb Builder) Append(elements ...string) *Builder {
	cloned := make([]string, len(pb.elements))
	copy(cloned, pb.elements)

	res := &Builder{elements: cloned}

	// Unescaped elements can't fail validation.
	//nolint:errcheck
	res.appendElements(false, elements)

	return res
}

// appendElements adds each non-empty element to the end of pb, in order.
//
// When escaped is false the elements are stored exactly as given. When
// escaped is true each element first has unescaped trailing separators
// trimmed, is skipped if nothing remains, is validated, and is then stored
// in its unescaped form. The first validation failure is returned; elements
// processed before the failure have already been added to pb.
func (pb *Builder) appendElements(escaped bool, elements []string) error {
	for _, raw := range elements {
		if raw == "" {
			continue
		}

		if !escaped {
			pb.elements = append(pb.elements, raw)
			continue
		}

		trimmed := TrimTrailingSlash(raw)
		// If the element was just the path separator then it is empty now.
		if trimmed == "" {
			continue
		}

		if err := validateEscapedElement(trimmed); err != nil {
			return err
		}

		pb.elements = append(pb.elements, unescape(trimmed))
	}

	return nil
}

// PopFront returns a new Builder containing every element of pb except the
// first (left-most) one. A Builder with one or zero elements yields an empty
// Builder. The result does not share storage with pb.
func (pb Builder) PopFront() *Builder {
	if n := len(pb.elements); n > 1 {
		rest := make([]string, 0, n-1)

		return &Builder{elements: append(rest, pb.elements[1:]...)}
	}

	return &Builder{}
}

// Dir returns a Builder containing every element of pb except the last
// (right-most) one. A Builder with one or zero elements yields an empty
// Builder.
//
// The result shares its backing array with pb to avoid a copy. Its capacity
// is capped at its length so that appending to the result always allocates
// fresh storage instead of overwriting pb's final element.
func (pb Builder) Dir() *Builder {
	if len(pb.elements) <= 1 {
		return &Builder{}
	}

	last := len(pb.elements) - 1

	return &Builder{
		// Sharing the elements is safe because Builders are immutable; the full
		// slice expression keeps that true even if a caller appends.
		elements: pb.elements[:last:last],
	}
}

// LastElem returns the final (right-most) element of pb in its unescaped
// form, or the empty string if pb has no elements.
func (pb Builder) LastElem() string {
	if n := len(pb.elements); n > 0 {
		return pb.elements[n-1]
	}

	return ""
}

// String renders pb as a single escaped path: every element is escaped
// individually and the results are joined with the path separator.
func (pb Builder) String() string {
	escaped := make([]string, len(pb.elements))

	for i := range pb.elements {
		escaped[i] = escapeElement(pb.elements[i])
	}

	return join(escaped)
}

// ShortRef returns a short, fixed-length hex reference derived from the
// elements of pb, or the empty string if pb has no elements. The reference is
// the leading shortRefCharacters hex characters of the SHA-256 digest of the
// unescaped elements.
//
// NOTE(review): the elements are concatenated without a separator before
// hashing, so builders whose elements concatenate to the same text (e.g.
// `a`,`bc` and `ab`,`c`) produce the same reference.
func (pb Builder) ShortRef() string {
	if len(pb.elements) == 0 {
		return ""
	}

	var concatenated bytes.Buffer

	for i := range pb.elements {
		concatenated.WriteString(pb.elements[i])
	}

	digest := sha256.Sum256(concatenated.Bytes())

	// The output is hex, where each byte of the digest renders as two
	// characters. Halving the target character count therefore gives the
	// number of digest bytes to format.
	return fmt.Sprintf("%x", digest[:shortRefCharacters/2])
}

// Elements returns a copy of all the elements in the path, so callers may
// modify the result without affecting pb. This is a temporary function and
// will likely be updated to handle encoded elements instead of clear-text
// elements in the future.
func (pb Builder) Elements() []string {
	out := make([]string, len(pb.elements))
	copy(out, pb.elements)

	return out
}

// verifyInputValues checks that the two required prefix values are present.
// It returns an error stacked on errMissingSegment that names the first empty
// value (tenant is checked before resourceOwner), or nil if both are set.
func verifyInputValues(tenant, resourceOwner string) error {
	switch {
	case tenant == "":
		return clues.Stack(errMissingSegment, errors.New("tenant"))
	case resourceOwner == "":
		return clues.Stack(errMissingSegment, errors.New("resourceOwner"))
	default:
		return nil
	}
}

// verifyPrefix checks that pb can be turned into a prefixed resource path:
// tenant and resourceOwner must both be non-empty and pb must contain at
// least one element to follow the prefix. The input-value check is reported
// first if both conditions fail.
func (pb Builder) verifyPrefix(tenant, resourceOwner string) error {
	err := verifyInputValues(tenant, resourceOwner)
	if err == nil && len(pb.elements) == 0 {
		err = errors.New("missing path beyond prefix")
	}

	return err
}

// withPrefix returns a new Builder made of the given prefix elements followed
// by the elements of pb. The prefix goes through Append, so empty prefix
// elements are skipped; pb's own elements are carried over as-is. The
// receiver is left unmodified.
func (pb Builder) withPrefix(elements ...string) *Builder {
	prefixed := Builder{}.Append(elements...)
	prefixed.elements = append(prefixed.elements, pb.elements...)

	return prefixed
}

// ToStreamStorePath turns pb into a Path in the metadata service that
// corresponds to the given service, using DetailsCategory as the category and
// purpose in the position normally held by the resource owner.
//
// An error is returned if tenant or purpose is empty, or if isItem is set but
// pb has no elements. Services without a metadata counterpart in the switch
// below are mapped to UnknownService without an error.
func (pb Builder) ToStreamStorePath(
	tenant, purpose string,
	service ServiceType,
	isItem bool,
) (Path, error) {
	if err := verifyInputValues(tenant, purpose); err != nil {
		return nil, err
	}

	// An item path needs at least one element to name the item.
	if isItem && len(pb.elements) == 0 {
		return nil, errors.New("missing path beyond prefix")
	}

	metadataService := UnknownService

	switch service {
	case ExchangeService:
		metadataService = ExchangeMetadataService
	case OneDriveService:
		metadataService = OneDriveMetadataService
	case SharePointService:
		metadataService = SharePointMetadataService
	}

	prefixed := pb.withPrefix(
		tenant,
		metadataService.String(),
		purpose,
		DetailsCategory.String(),
	)

	return &dataLayerResourcePath{
		Builder:  *prefixed,
		service:  metadataService,
		category: DetailsCategory,
		hasItem:  isItem,
	}, nil
}

// ToServiceCategoryMetadataPath turns pb into a Path in the metadata service
// that corresponds to the given service, keeping the given category.
//
// An error is returned if the service/category pair fails validation, if
// tenant or user is empty, or if isItem is set but pb has no elements.
// Services without a metadata counterpart in the switch below are mapped to
// UnknownService.
func (pb Builder) ToServiceCategoryMetadataPath(
	tenant, user string,
	service ServiceType,
	category CategoryType,
	isItem bool,
) (Path, error) {
	if err := validateServiceAndCategory(service, category); err != nil {
		return nil, err
	}

	if err := verifyInputValues(tenant, user); err != nil {
		return nil, err
	}

	// An item path needs at least one element to name the item.
	if isItem && len(pb.elements) == 0 {
		return nil, errors.New("missing path beyond prefix")
	}

	metadataService := UnknownService

	switch service {
	case ExchangeService:
		metadataService = ExchangeMetadataService
	case OneDriveService:
		metadataService = OneDriveMetadataService
	case SharePointService:
		metadataService = SharePointMetadataService
	}

	prefixed := pb.withPrefix(
		tenant,
		metadataService.String(),
		user,
		category.String(),
	)

	return &dataLayerResourcePath{
		Builder:  *prefixed,
		service:  metadataService,
		category: category,
		hasItem:  isItem,
	}, nil
}

// ToDataLayerPath turns pb into a resource Path by prepending the standard
// prefix of tenant, service, user, and category (in that order) to pb's
// elements.
//
// An error is returned if the service/category pair fails validation, if
// tenant or user is empty, or if pb has no elements to follow the prefix.
func (pb Builder) ToDataLayerPath(
	tenant, user string,
	service ServiceType,
	category CategoryType,
	isItem bool,
) (Path, error) {
	err := validateServiceAndCategory(service, category)
	if err != nil {
		return nil, err
	}

	if err = pb.verifyPrefix(tenant, user); err != nil {
		return nil, err
	}

	prefixed := pb.withPrefix(
		tenant,
		service.String(),
		user,
		category.String(),
	)

	return &dataLayerResourcePath{
		Builder:  *prefixed,
		service:  service,
		category: category,
		hasItem:  isItem,
	}, nil
}

// ToDataLayerExchangePathForCategory is a convenience wrapper around
// ToDataLayerPath that fixes the service to ExchangeService. All validation
// and errors come from ToDataLayerPath.
func (pb Builder) ToDataLayerExchangePathForCategory(
	tenant, user string,
	category CategoryType,
	isItem bool,
) (Path, error) {
	return pb.ToDataLayerPath(tenant, user, ExchangeService, category, isItem)
}

// ToDataLayerOneDrivePath is a convenience wrapper around ToDataLayerPath
// that fixes the service to OneDriveService and the category to
// FilesCategory. All validation and errors come from ToDataLayerPath.
func (pb Builder) ToDataLayerOneDrivePath(
	tenant, user string,
	isItem bool,
) (Path, error) {
	return pb.ToDataLayerPath(tenant, user, OneDriveService, FilesCategory, isItem)
}

// ToDataLayerSharePointPath is a convenience wrapper around ToDataLayerPath
// that fixes the service to SharePointService; site is passed in the position
// ToDataLayerPath names user. All validation and errors come from
// ToDataLayerPath.
func (pb Builder) ToDataLayerSharePointPath(
	tenant, site string,
	category CategoryType,
	isItem bool,
) (Path, error) {
	return pb.ToDataLayerPath(tenant, site, SharePointService, category, isItem)
}

// FromDataLayerPath parses the escaped path p, validates that its elements
// match a resource-specific path format, and returns a Path for that
// resource-specific type. isItem is recorded on the returned Path as-is.
//
// An error is returned if p is empty once trailing separators are trimmed,
// if any element is badly escaped, if there are fewer than five elements, or
// if the service and category elements are not a valid combination.
func FromDataLayerPath(p string, isItem bool) (Path, error) {
	// Four prefix elements (tenant, service, resource owner, category) plus at
	// least one element beyond the prefix.
	const minElements = 5

	p = TrimTrailingSlash(p)
	// If p was just the path separator then it will be empty now.
	if p == "" {
		return nil, clues.New("logically empty path given").With("path_string", p)
	}

	// Turn into a Builder to reuse code that ignores empty elements.
	parsed, err := Builder{}.UnescapeAndAppend(Split(p)...)
	if err != nil {
		return nil, clues.Stack(errParsingPath, err).With("path_string", p)
	}

	if len(parsed.elements) < minElements {
		return nil, clues.New("path has too few segments").With("path_string", p)
	}

	// The service sits at index 1 and the category at index 3 of the prefix.
	svc, cat, err := validateServiceAndCategoryStrings(
		parsed.elements[1],
		parsed.elements[3],
	)
	if err != nil {
		return nil, clues.Stack(errParsingPath, err).With("path_string", p)
	}

	return &dataLayerResourcePath{
		Builder:  *parsed,
		service:  svc,
		category: cat,
		hasItem:  isItem,
	}, nil
}

// escapeElement returns element with an escape character inserted in front of
// every character listed in charactersToEscape. If the element contains no
// such characters it is returned unchanged.
func escapeElement(element string) string {
	var (
		out  strings.Builder
		from int
	)

	for pos, r := range element {
		if _, needsEscape := charactersToEscape[r]; !needsEscape {
			continue
		}

		// Flush the text before this character followed by the escape. The
		// character itself is left for the next flush so it lands directly
		// after the escape.
		out.WriteString(element[from:pos])
		out.WriteRune(escapeCharacter)

		from = pos
	}

	// Nothing was written, so nothing needed escaping.
	if out.Len() == 0 {
		return element
	}

	// Add the end of the element, starting at the last escaped character.
	out.WriteString(element[from:])

	return out.String()
}

// unescape converts an escaped element into its "raw" form by removing each
// escape character and keeping the character that follows it verbatim. Using
// this function on segments that contain escaped path separators will result
// in an ambiguous or incorrect segment.
func unescape(element string) string {
	var (
		out  strings.Builder
		from int
	)

	for i := 0; i < len(element); i++ {
		if element[i] != escapeCharacter {
			continue
		}

		// Drop the escape character and keep everything before it.
		out.WriteString(element[from:i])
		from = i + 1

		// Step over the escaped character so that an escaped escape character
		// is kept instead of being treated as the start of a new sequence.
		i++
	}

	out.WriteString(element[from:])

	return out.String()
}

// validateEscapedElement checks an escaped element that has already had its
// trailing separators trimmed. It returns an error if a character that
// requires escaping appears unescaped, if an escape character is followed by
// a character that doesn't need escaping, or if the element ends in the
// middle of an escape sequence.
func validateEscapedElement(element string) error {
	pendingEscape := false

	for _, r := range element {
		_, special := charactersToEscape[r]

		if pendingEscape {
			// This character completes an escape sequence, so it must be one
			// that actually needs escaping.
			pendingEscape = false

			if !special {
				return clues.New("bad escape sequence in path").
					With("escape_sequence", fmt.Sprintf("'%c%c'", escapeCharacter, r))
			}

			continue
		}

		if r == escapeCharacter {
			pendingEscape = true
			continue
		}

		if special {
			return clues.New("unescaped character in path").With("character", r)
		}
	}

	if pendingEscape {
		return errors.New("trailing escape character")
	}

	return nil
}

// TrimTrailingSlash takes an escaped path element and returns it with any
// trailing, unescaped path separator characters removed. A trailing separator
// that is escaped is part of the element and stops the trimming. If there is
// nothing to trim the input is returned unchanged.
func TrimTrailingSlash(element string) string {
	for {
		end := len(element)
		if end == 0 || element[end-1] != PathSeparator {
			return element
		}

		// Count the run of escape characters directly before the separator.
		escapes := 0
		for j := end - 2; j >= 0 && element[j] == escapeCharacter; j-- {
			escapes++
		}

		// An odd run means the last escape character applies to the separator
		// (the others pair up with each other), so the separator must stay.
		if escapes%2 == 1 {
			return element
		}

		element = element[:end-1]
	}
}

// join concatenates the given, already escaped, elements into one string with
// the path separator '/' placed between neighbouring elements.
func join(elements []string) string {
	// The standard path package can't be used here because it does not treat
	// escaped '/' and '\' according to this package's escaping rules.
	sep := string(PathSeparator)

	return strings.Join(elements, sep)
}

// Split takes an escaped string and returns a slice of path elements, each
// still in escaped form. The string is split on every path separator that is
// not escaped. Leading separators and empty elements between consecutive
// separators are dropped. The provided string must not contain an unescaped
// trailing path separator; the text after the last separator is always
// returned as the final element, even when it is empty.
func Split(segment string) []string {
	var (
		parts = make([]string, 0)
		// begin is the index where the element being scanned starts.
		begin int
		// escapeRun counts the consecutive escape characters seen immediately
		// before the current character.
		escapeRun int
		// afterSeparator starts out true so that a leading separator is ignored.
		afterSeparator = true
	)

	for pos, r := range segment {
		switch {
		case r == escapeCharacter:
			escapeRun++
			afterSeparator = false

		case r != PathSeparator, escapeRun%2 != 0:
			// Either an ordinary character or a separator that is escaped by
			// an odd run of escape characters. Both belong to the element.
			escapeRun = 0
			afterSeparator = false

		default:
			// An unescaped separator. Only emit the element if it would not
			// be empty.
			if !afterSeparator {
				parts = append(parts, segment[begin:pos])
			}

			// The separator itself is not part of any element.
			begin = pos + 1
			afterSeparator = true
			escapeRun = 0
		}
	}

	// The loop never emits the final element since no separator follows it.
	return append(parts, segment[begin:])
}