Source code for ClearMap.Utils.TagExpression

# -*- coding: utf-8 -*-
"""
TagExpression
=============

Module providing routines to check and convert between tag expressions.
This simplifies the handling of lists of files by using tag expressions
instead of regular expressions.

A tag is a simple placeholder for a string or number in a file name.
An expression is a filename with any number of tags.

A tag has the format <Name,Type,Width>. 
  * Name : str that specifies the tag name
  * Type : 'I' or 'S' for integer or string, optional and defaults to 'I'
  * Width : int, if given indicates a fixed width and missing digits or chars
    are filled with leading zeros or spaces.

The module also provides functions to infer the tag expressions from a list of
file names via the function :func:`~ClearMap.Utils.TagExpression.detect`.

Expressions can also be converted to glob or regular expressions.


Example
-------
An example tag expression would be 'file_<X,2>_<Y,3>.npy' and would for example
match the filename 'file_01_013.npy'

>>> import ClearMap.Utils.TagExpression as te
>>> e = te.Expression('file_<X,2>_<Y,3>.npy')
>>> e.tag_names()
['X', 'Y']

>>> e.glob()
'file_[0-9][0-9]_[0-9][0-9][0-9].npy'

>>> e.indices('file_01_013.npy')
[1, 13]

>>> e.re()
'file_(?P<X>\\d{2})_(?P<Y>\\d{3})\\.npy'

>>> e.string({'X':1, 'Y':10})
'file_01_010.npy'

See also
--------
:mod:`~ClearMap.Utils.RegularExpression`
"""
__author__    = 'Christoph Kirst <christoph.kirst.ck@gmail.com>'
__license__   = 'GPLv3 - GNU General Pulic License v3 (see LICENSE.txt)'
__copyright__ = 'Copyright © 2020 by Christoph Kirst'
__webpage__   = 'http://idisco.info'
__download__  = 'http://www.github.com/ChristophKirst/ClearMap2'


import copy

import re

# Delimiters and type codes of the tag syntax <Name,Type,Width>.
TAG_START = '<'      # opens a tag
TAG_END = '>'        # closes a tag
TAG_SEPARATOR = ','  # separates the entries inside a tag
TAG_INT = 'I'        # type code of integer valued tags
TAG_STR = 'S'        # type code of string valued tags


[docs]
def ttype_to_dtype(ttype):
  """Map a tag type code to the Python type of the tag's values.

  Arguments
  ---------
  ttype : str or None
    TAG_INT, TAG_STR or None. None is treated like TAG_INT.

  Returns
  -------
  dtype : type
    int for integer tags and str for string tags.

  Raises
  ------
  ValueError
    If ttype is none of the accepted values.
  """
  if ttype == TAG_STR:
    return str
  if ttype is None or ttype == TAG_INT:
    return int
  raise ValueError('The specified tag type %r is not valid!' % ttype)



[docs]
def default_tag_name(index = None):
  """Return the name used for a tag that has no name of its own.

  Arguments
  ---------
  index : int or None
    Position of the tag; appended to the base name if given.

  Returns
  -------
  name : str
    'Tag' without an index, otherwise 'Tag' followed by the index.
  """
  if index is None:
    return 'Tag'
  return 'Tag' + '%d' % index




[docs]
class Tag(object):
  """A single placeholder of the form <Name,Type,Width> in a tag expression."""
  # Instance attributes:
  #   name      : str or None, the tag name
  #   ttype     : TAG_INT, TAG_STR or None, the type code of the values
  #   width     : int or None, fixed number of characters if given
  #   reference : truthy if the tag refers back to an earlier tag of the same name
  #   trange    : sequence of values or None, used to map values to indices

  def __init__(self, tag = None, name = None, ttype = TAG_INT, width = None, reference = None, trange = None):
    """Create a tag from a tag string or from its individual properties.

    Arguments
    ---------
    tag : str or None
      Textual tag, e.g. '<X,I,2>'. If given it is parsed and the remaining
      arguments are not used.
    name : str or None
      The tag name.
    ttype : str or None
      The type code of the tag.
    width : int or None
      The fixed width of the tag.
    reference : bool or None
      Whether the tag references an earlier tag.
    trange : sequence or None
      The values the tag can take.
    """
    if tag is not None:
      self.parse(tag)
      return
    self.name, self.ttype, self.width = name, ttype, width
    self.reference, self.trange = reference, trange
  

[docs]
  def label(self, index = None):
    """Return the tag's name, or a default name built from index for an unnamed tag."""
    return default_tag_name(index = index) if self.name is None else self.name

  

[docs]
  def dtype(self):
    """Return the Python type that corresponds to the type code of this tag."""
    value_type = ttype_to_dtype(self.ttype)
    return value_type



[docs]
  def tag(self):
    """Return the textual form of the tag, e.g. '<X,I,2>'.

    Only the entries that are not None are written, in the order
    name, type, width. A tag without any entry is written as '<>'.
    """
    entries = []
    if self.name is not None:
      entries.append(self.name)
    if self.ttype is not None:
      entries.append(self.ttype)
    if self.width is not None:
      entries.append(str(self.width))
    return TAG_START + TAG_SEPARATOR.join(entries) + TAG_END

  

[docs]
  def glob(self):
    """Return a glob pattern that matches the text of this tag.

    Returns
    -------
    pattern : str
      '*' if the tag has no fixed width, otherwise one single-character
      pattern per character of the width: '[0-9]' for integer tags and
      '?' for all other tags.
    """
    if self.width is None:
      return '*'
    # Any non-integer type is matched as arbitrary characters, as in re() and
    # string(); previously an unknown type left the pattern unbound.
    if self.ttype == TAG_INT or self.ttype is None:
      single = '[0-9]'
    else:
      single = '?'
    return single * self.width



[docs]
  def re(self, name = None):
    """Return the regular expression that matches the text of this tag.

    Arguments
    ---------
    name : str or None
      Name of the regular expression group. If None the tag name is used.

    Returns
    -------
    expression : str
      For a reference tag a back-reference '(?P=name)'. Otherwise a group,
      named if a name is available, that matches digits for integer tags and
      any character for all other tags, either exactly `width` times or
      non-greedily any number of times if no width is set.

    Raises
    ------
    ValueError
      If the tag is a reference and no name is available.
    """
    if name is None:
      name = self.name

    if self.reference:
      if name is None:
        raise ValueError('Name needs to be given for a referenced tag!')
      return '(?P=' + name + ')'

    group_open = '(' if name is None else '(?P<' + name + '>'
    # raw string: '\d' in a normal literal is an invalid escape sequence
    char_class = r'\d' if self.ttype == TAG_INT or self.ttype is None else '.'
    repeat = '*?' if self.width is None else '{' + str(self.width) + '}'
    return group_open + char_class + repeat + ')'

    

[docs]
  def string(self, value = None):
    """Format a value as the text this tag stands for.

    Arguments
    ---------
    value : int, str or None
      The value to format. If None the textual tag itself is returned.

    Returns
    -------
    string : str
      The value formatted with '%d' for integer tags and '%s' otherwise.
      With a fixed width integers are zero padded and strings are space
      padded on the left to that width.
    """
    if value is None:
      return self.tag()

    fixed = self.width is not None
    if self.ttype == TAG_INT or self.ttype is None:
      frmt = '%0' + str(self.width) + 'd' if fixed else '%d'
    else:
      frmt = '%' + str(self.width) + 's' if fixed else '%s'
    return frmt % value

  
  

[docs]
  def string_from_index(self, index = None):
    """Format the value that belongs to an index.

    Arguments
    ---------
    index : int or None
      Position in the tag range. Without a tag range the index itself is
      used as the value. If None the textual tag is returned.

    Returns
    -------
    string : str
      The formatted value.
    """
    if index is None:
      return self.string(value=None)
    if self.trange is None:
      return self.string(value=index)
    return self.string(value=self.trange[index])

  
  

[docs]
  def value(self, string):
    """Convert the text matched by this tag to a value of the tag's type."""
    convert = self.dtype()
    return convert(string)

  
    

[docs]
  def index(self, value):
    """Return the index of a value of this tag.

    Arguments
    ---------
    value : int or str
      The value to look up.

    Returns
    -------
    index : int
      The position of the first entry of the tag range that equals the
      value. Without a tag range a non-string value is returned unchanged.

    Raises
    ------
    IndexError
      If the value is not in the tag range, or if there is no tag range and
      the value is a string.
    """
    if self.trange is not None:
      for position, candidate in enumerate(self.trange):
        if value == candidate:
          return position
      raise IndexError('Value %r not in tag range %r!' % (value, self.trange))

    if isinstance(value, str):
      raise IndexError('No range to determine index for tag %r and value %r!' % (self, value))
    return value

  

[docs]
  def parse(self, tag):
    """Initialize this tag from its textual form, e.g. '<X,I,2>'.

    The text between the delimiters is split at the separator into at most
    three entries. An entry after the first one that converts to an int is
    the width, an entry equal to TAG_INT or TAG_STR is the type and any other
    entry is the name. The first entry is never read as a width. An empty
    tag '<>' resets the tag to the constructor defaults.

    Arguments
    ---------
    tag : str
      The textual tag including the start and end delimiters.

    Raises
    ------
    ValueError
      If the delimiters are missing, if there are more than three entries,
      an empty entry, or more than one name, type or width.
    """
    if len(tag) < len(TAG_START) + len(TAG_END):
      raise ValueError('The string %s is not a valid tag!' % tag)
    if tag[:len(TAG_START)] != TAG_START:
      raise ValueError('Expecting the tag to start with %s found %s!' % (TAG_START, tag[:len(TAG_START)]))
    if tag[-len(TAG_END):] != TAG_END:
      raise ValueError('Expecting the tag to end with %s found %s!' % (TAG_END, tag[-len(TAG_END):]))

    content = tag[len(TAG_START):-len(TAG_END)]
    if len(content) == 0:
      self.__init__()
      return

    values = content.split(TAG_SEPARATOR)
    if len(values) > 3:
      raise ValueError('Found %d > %d tag specifications!' % (len(values), 3))

    name = []
    ttype = []
    width = []
    for i, v in enumerate(values):
      # a width specification is not expected in the first entry
      if i > 0:
        try:
          width.append(int(v))
          continue
        except ValueError:
          pass  # not a number, so the entry is a type or a name
      if v in (TAG_INT, TAG_STR):
        ttype.append(v)
      elif len(v) == 0:
        raise ValueError('Found two tag separators without a value in between!')
      else:
        name.append(v)

    if len(name) > 1:
      raise ValueError('More than one name found in tag: %r' % name)
    if len(ttype) > 1:
      raise ValueError('More than one type found in tag: %r' % ttype)
    if len(width) > 1:
      raise ValueError('More than one widht found in tag: %r' % width)

    name = name[0] if len(name) == 1 else None
    ttype = ttype[0] if len(ttype) == 1 else None
    width = width[0] if len(width) == 1 else None

    self.__init__(name=name, ttype=ttype, width=width)


  def __str__(self):
    """Return the textual form of the tag as produced by tag()."""
    text = self.tag()
    return text
  
  def __repr__(self):
    """Return the same text as __str__."""
    text = self.__str__()
    return text




[docs]
class Expression(object):
  """A file name pattern made of literal strings and tags."""
  # Instance attributes:
  #   pattern : list of str and Tag, the parts of the expression in order
  #   tags    : list of the Tag objects in pattern that are not references

  def __init__(self, pattern = None):
    """Create an expression.

    Arguments
    ---------
    pattern : Expression, str, list or None
      An Expression is shallow copied, a str is parsed as a tag expression
      and a list is used directly as the list of pattern parts. None gives
      an empty expression.
    """
    if isinstance(pattern, Expression):
      self.pattern = copy.copy(pattern.pattern)
      self.tags = copy.copy(pattern.tags)
      return

    if isinstance(pattern, str):
      self.parse(pattern)
      return

    self.pattern = [] if pattern is None else pattern
    self.tags = [p for p in self.pattern if isinstance(p, Tag) and not p.reference]
   

[docs]
  def tag(self):
    """Return the expression as a string with all tags in their textual form."""
    return ''.join(part.tag() if isinstance(part, Tag) else part for part in self.pattern)

  

[docs]
  def re(self):
    """Return the regular expression that corresponds to this expression.

    Literal parts are escaped. Every tag becomes a group that is named by
    the tag's label, where unnamed tags are labelled by their position among
    the tags of the pattern.
    """
    parts = []
    tag_count = 0
    for part in self.pattern:
      if not isinstance(part, Tag):
        parts.append(re.escape(part))
        continue
      parts.append(part.re(name = part.label(tag_count)))
      tag_count += 1
    return ''.join(parts)

    

[docs]
  def glob(self, values = None):
    """Return the glob pattern that corresponds to this expression.

    Arguments
    ---------
    values : dict or None
      Optional values for some of the tags, keyed by tag label. Tags with a
      value are replaced by the escaped formatted value, all other tags by
      their glob pattern.

    Returns
    -------
    pattern : str
      The glob pattern.
    """
    parts = []
    tag_count = 0
    for part in self.pattern:
      if isinstance(part, Tag):
        name = part.label(tag_count)
        if values is not None and name in values.keys():
          parts.append(escape_glob(part.string(value = values[name])))
        else:
          parts.append(part.glob())
        tag_count += 1
      else:
        parts.append(escape_glob(part))
    return ''.join(parts)

    

[docs]
  def string(self, values = None):
    """Return the string obtained by filling the tags with values.

    Arguments
    ---------
    values : dict or None
      Values keyed by tag label. Tags without a value keep their textual
      tag form.

    Returns
    -------
    string : str
      The expression with the values inserted.
    """
    parts = []
    tag_count = 0
    for part in self.pattern:
      if not isinstance(part, Tag):
        parts.append(part)
        continue
      if values is None:
        tag_value = None
      else:
        tag_value = values.get(part.label(tag_count), None)
      parts.append(part.string(value = tag_value))
      tag_count += 1
    return ''.join(parts)

    

[docs]
  def values(self, string):
    tags = self.tags;
    search = re.compile(self.re()).search;
    match = search(string);
    if match is None:
      return {};
    else:
      d = match.groupdict();
      for k,v in d.items():
        for i,t in enumerate(tags):
          if k == t.label(i):
            v = t.dtype()(v);
            d[k] = v;
            break;
      return d;

    

[docs]
  def string_from_index(self, indices):
    """Return the string obtained by filling the tags via their indices.

    Arguments
    ---------
    indices : dict or sequence
      Either a dict of indices keyed by tag label or a sequence with one
      index per tag in the order of the tags of this expression.

    Returns
    -------
    string : str
      The expression with the values of the indices inserted.
    """
    if not isinstance(indices, dict):
      indices = dict((t.label(i), indices[i]) for i, t in enumerate(self.tags))

    parts = []
    tag_count = 0
    for part in self.pattern:
      if isinstance(part, Tag):
        parts.append(part.string_from_index(index = indices[part.label(tag_count)]))
        tag_count += 1
      else:
        parts.append(part)
    return ''.join(parts)

    

[docs]
  def indices(self, string):
    tags = self.tags;
    search = re.compile(self.re()).search;
    match = search(string);
    if match is None:
      raise ValueError('Cannot infer indices from string!')
    else:
      d = match.groupdict();
      for k,v in d.items():
        for i,t in enumerate(tags):
          if k == t.label(i):
            v = t.index(t.dtype()(v));
            d[k] = v;
            break;
    indices = [d[t.label(i)] for t in tags];
    return indices;

  

[docs]
  def tag_names(self):
    return [t.label(i) for i,t in enumerate(self.tags)];

  

[docs]
  def ntags(self):
    return len(self.tags); 

  
  def __getitem__(self, i):
    if isinstance(i, int):
      return self.tags[i];
    else:
      for j,n in enumerate(self.tag_names()):
        if n == i:
          return self.tags[j];
      raise IndexError('No tag with name %r!' % i);
  

[docs]
  def parse(self, expression):
    """Initialize this expression from a tag expression string.

    The string is split into literal parts and tags. A named tag that
    repeats the name of an earlier tag is marked as a reference to it and
    inherits the type and width it does not specify itself.

    Arguments
    ---------
    expression : str
      The tag expression, e.g. 'file_<X,2>_<Y,3>.npy'.

    Raises
    ------
    ValueError
      If a reference specifies a different type than the tag it refers to.
    """
    tag_finder = re.compile(TAG_START + '.*?' + TAG_END)
    pattern = []
    tags = []
    position = 0
    for found in tag_finder.finditer(expression):
      begin, end = found.start(), found.end()
      if begin > position:
        # literal text between the previous tag and this one
        pattern.append(expression[position:begin])
      new_tag = Tag(tag = found.group())
      pattern.append(new_tag)
      tags.append(new_tag)
      position = end
    if position < len(expression):
      pattern.append(expression[position:])

    # later tags with the same name become references to the first one
    for i, t in enumerate(tags):
      if t.name is None or t.reference is True:
        continue
      for r in tags[i+1:]:
        if r.name != t.name:
          continue
        r.reference = True
        if r.ttype is None:
          r.ttype = t.ttype
        elif r.ttype != t.ttype:
          raise ValueError('The reference %r has not the same type as the tag %r!' % (r,t))
        if r.width is None:
          r.width = t.width

    self.pattern = pattern
    self.tags = [q for q in pattern if isinstance(q, Tag) and not q.reference]

  
  

[docs]
  def detect(self, strings, names = None, max_check = None, with_trange = False):
    """Infer this expression from a list of strings of equal length.

    Every maximal run of positions at which the strings differ becomes a
    tag of fixed width. A tag is extended to the left over zeros that
    directly precede it. All other characters are kept as literal text
    taken from the first string.

    Arguments
    ---------
    strings : list, tuple or str
      The strings to infer the expression from.
    names : list of str or None
      Names for the detected tags in order. Remaining tags get default
      names. The list is not modified.
    max_check : int or None
      Number of strings that are inspected to determine the tag types and
      ranges. If None all strings are used.
    with_trange : bool
      If True, each tag stores the values found in the inspected strings,
      one per string, as its tag range.

    Raises
    ------
    ValueError
      If no strings are given or the strings differ in length.
    """
    if not isinstance(strings, (list, tuple)):
      strings = [strings]
    if len(strings) == 0:
      raise ValueError('Cannot infer tag expression from an empty list of strings!')

    s0 = strings[0]
    if any(len(s) != len(s0) for s in strings):
      raise ValueError('Cannot infer tag expression from strings of different length!')

    if max_check is None:
      max_check = len(strings)

    # work on a copy so that the list of the caller is left untouched
    names = [] if names is None else list(names)

    # detect maximal runs of positions at which the strings differ
    tags = []
    tag_start = None
    for i, c in enumerate(s0):
      same = all(s[i] == c for s in strings[1:])
      if not same:
        if tag_start is None:
          tag_start = i
      elif tag_start is not None:
        tags.append((tag_start, i))
        tag_start = None
    if tag_start is not None:
      # the last run reaches the end of the strings
      tags.append((tag_start, len(s0)))

    # include leading zeros, without growing into the previous tag
    tags_full = []
    lower = 0
    for s, e in tags:
      while s > lower and s0[s-1] == '0':
        s -= 1
      tags_full.append((s, e))
      lower = e
    tags = tags_full

    # infer pattern
    pattern = []
    p = 0
    for i, (s, e) in enumerate(tags):
      if s > p:
        pattern.append(s0[p:s])
      # always continue after this tag, also if no literal text precedes it
      p = e

      checked = [string[s:e] for string in strings[:max_check]]
      ttype = TAG_INT
      values = []
      for v in checked:
        try:
          values.append(int(v))
        except ValueError:
          ttype = TAG_STR
          break
      if ttype == TAG_STR:
        # string tags keep the raw text so that all values have one type
        values = checked

      if len(names) > 0:
        name = names.pop(0)
      else:
        name = default_tag_name(i)
      trange = values if with_trange else None
      pattern.append(Tag(name = name, ttype = ttype, width = e - s, trange = trange))
    if p < len(s0):
      pattern.append(s0[p:])

    self.pattern = pattern
    self.tags = [q for q in pattern if isinstance(q, Tag) and not q.reference]

  
  
  def __str__(self):
    """Return the expression in tag form wrapped in 'TagExpression(...)'."""
    return ''.join(['TagExpression(', self.tag(), ')'])
    
  def __repr__(self):
    """Return the same text as __str__."""
    text = self.__str__()
    return text

      


[docs]
def parse(expression):
  """Create an Expression from a tag expression string.

  Arguments
  ---------
  expression : str
    The tag expression, e.g. 'file_<X,2>_<Y,3>.npy'.

  Returns
  -------
  expression : Expression
    The parsed expression.
  """
  parsed = Expression()
  parsed.parse(expression=expression)
  return parsed




[docs]
def detect(strings, names = None, max_check = None, with_trange = False):
  """Create an Expression inferred from a list of strings.

  All arguments are passed on to :meth:`Expression.detect`.

  Returns
  -------
  expression : Expression
    The detected expression.
  """
  detected = Expression()
  detected.detect(strings=strings, names=names, max_check=max_check, with_trange=with_trange)
  return detected




[docs]
def escape_glob(string, special = '?[]'):
  """Escape glob special characters so that they match literally.

  Arguments
  ---------
  string : str
    The text to escape.
  special : str
    The characters to escape. Each of them is wrapped in square brackets.
    The default does not contain '*', so a '*' in the text stays a
    wildcard; pass '*?[]' to make the whole text literal.

  Returns
  -------
  escaped : str
    The text with every special character c replaced by '[c]'.
  """
  return ''.join('[' + c + ']' if c in special else c for c in string)





def _test():
  """Exercise the module by hand; needs the ClearMap package and its test files."""
  import ClearMap.Utils.TagExpression as te

  # values and strings
  expression = te.parse('/test/test<X,I,4>_<Y,I,3>_<X>.tif')

  filename = '/test/test0010_013_0010.tif'
  found = expression.values(filename)
  rebuilt = expression.string(found)
  filename == rebuilt

  expression.string({'X' : 111})

  # indices
  expression.indices(filename)
  expression = te.parse('/test/test<X,I,4>_<Y,S>.tif')
  expression['X']
  expression['Y'].trange = list('abcd')
  expression.string_from_index([0,2])

  # glob
  import ClearMap.Tests.Files as tf
  import glob

  sequence = tf.io.join(tf.tif_sequence, 'sequence<I,4>.tif')
  expression = te.parse(sequence)
  files = glob.glob(expression.glob())

  # detection
  te.detect(files, names = ['X'])

  expression = te.detect(files, names = ['X'], with_trange = True)
  expression['X'].trange