Reference#

This part of the documentation covers the public interface of itembed.

Preprocessing tools#

A few helpers are provided to clean the data and convert it to the expected format.

index_batch_stream #

index_batch_stream(num_index, batch_size)

Indices generator.
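
Examples:

A minimal sketch (indices are shuffled, so only the batch shape and dtype are deterministic):

>>> stream = index_batch_stream(10, 4)
>>> batch = next(stream)
>>> batch.shape
(4,)
>>> batch.dtype
dtype('int32')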

Source code in src/itembed/util.py
def index_batch_stream(num_index, batch_size):
    """Indices generator."""

    assert num_index > 0
    assert batch_size > 0

    indices = np.arange(num_index, dtype=np.int32)
    # TODO improve this algorithm

    # Replicate indices, to cover at least one batch
    if num_index < batch_size:
        repeat = batch_size // num_index + 1
        indices = np.tile(indices, repeat)
        num_index *= repeat

    # Loop forever
    while True:
        np.random.shuffle(indices)
        i = 0
        while i + batch_size <= num_index:
            yield indices[i : i + batch_size]
            i += batch_size

pack_itemsets #

pack_itemsets(itemsets, *, min_count=1, min_length=1)

Convert itemset collection to packed indices.

Parameters:

itemsets : list of list of object, required
    List of sets of hashable objects.
min_count : int, optional (default: 1)
    Minimal frequency count to be kept.
min_length : int, optional (default: 1)
    Minimal itemset length.

Returns:

labels : list of object
    Mapping from indices to labels.
indices : int32, num_item
    Packed index array.
offsets : int32, num_itemset + 1
    Itemsets offsets in packed array.

Examples:

>>> itemsets = [
...     ["apple"],
...     ["apple", "sugar", "flour"],
...     ["pear", "sugar", "flour", "butter"],
...     ["apple", "pear", "sugar", "butter", "cinnamon"],
...     ["salt", "flour", "oil"],
... ]
>>> pack_itemsets(itemsets, min_length=2)
(['apple', 'sugar', 'flour', 'pear', 'butter', 'cinnamon', 'salt', 'oil'],
 array([0, 1, 2, 3, 1, 2, 4, 0, 3, 1, 4, 5, 6, 2, 7]),
 array([ 0,  3,  7, 12, 15]))
Source code in src/itembed/util.py
def pack_itemsets(itemsets, *, min_count=1, min_length=1):
    """Convert itemset collection to packed indices.

    Parameters
    ----------
    itemsets: list of list of object
        List of sets of hashable objects.
    min_count: int, optional
        Minimal frequency count to be kept.
    min_length: int, optional
        Minimal itemset length.

    Returns
    -------
    labels: list of object
        Mapping from indices to labels.
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.

    Examples
    --------
    >>> itemsets = [
    ...     ["apple"],
    ...     ["apple", "sugar", "flour"],
    ...     ["pear", "sugar", "flour", "butter"],
    ...     ["apple", "pear", "sugar", "butter", "cinnamon"],
    ...     ["salt", "flour", "oil"],
    ... ]
    >>> pack_itemsets(itemsets, min_length=2)
    (['apple', 'sugar', 'flour', 'pear', 'butter', 'cinnamon', 'salt', 'oil'],
     array([0, 1, 2, 3, 1, 2, 4, 0, 3, 1, 4, 5, 6, 2, 7]),
     array([ 0,  3,  7, 12, 15]))

    """

    # Count labels
    counter = Counter()
    for itemset in itemsets:
        counter.update(itemset)
    if None in counter:
        del counter[None]

    # Define label list
    labels = [l for l, c in counter.most_common() if c >= min_count]
    label_map = {l: i for i, l in enumerate(labels)}

    # Generate indices
    indices = []
    offsets = [0]
    for itemset in itemsets:
        itemset_indices = []
        for label in itemset:
            index = label_map.get(label)
            if index is not None:
                itemset_indices.append(index)
        if len(itemset_indices) >= min_length:
            indices.extend(itemset_indices)
            offsets.append(len(indices))

    # Convert to arrays
    indices = np.array(indices, dtype=np.int32)
    offsets = np.array(offsets, dtype=np.int32)
    return labels, indices, offsets

prune_itemsets #

prune_itemsets(indices, offsets, *, mask=None, min_length=None)

Filter packed indices.

Either an explicit mask or a length threshold must be defined.

Parameters:

indices : int32, num_item, required
    Packed index array.
offsets : int32, num_itemset + 1, required
    Itemsets offsets in packed array.
mask : bool, num_itemset, optional (default: None)
    Boolean mask.
min_length : int, optional (default: None)
    Minimum length, inclusive.

Returns:

indices : int32, num_item
    Packed index array.
offsets : int32, num_itemset + 1
    Itemsets offsets in packed array.

Examples:

>>> indices = np.array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3])
>>> offsets = np.array([0, 1, 3, 6, 10])
>>> mask = np.array([True, True, False, True])
>>> prune_itemsets(indices, offsets, mask=mask, min_length=2)
(array([0, 1, 0, 1, 2, 3]), array([0, 2, 6]))
Source code in src/itembed/util.py
def prune_itemsets(indices, offsets, *, mask=None, min_length=None):
    """Filter packed indices.

    Either an explicit mask or a length threshold must be defined.

    Parameters
    ----------
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.
    mask: bool, num_itemset
        Boolean mask.
    min_length: int
        Minimum length, inclusive.

    Returns
    -------
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.

    Examples
    --------
    >>> indices = np.array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3])
    >>> offsets = np.array([0, 1, 3, 6, 10])
    >>> mask = np.array([True, True, False, True])
    >>> prune_itemsets(indices, offsets, mask=mask, min_length=2)
    (array([0, 1, 0, 1, 2, 3]), array([0, 2, 6]))

    """

    # Build mask from length limit, if needed
    lengths = offsets[1:] - offsets[:-1]
    if min_length is not None:
        length_mask = lengths >= min_length
        if mask is None:
            mask = length_mask
        else:
            mask = np.logical_and(mask, length_mask)
    assert mask is not None, "either mask or min_length must be defined"
    assert lengths.shape == mask.shape

    # Allocate buffers
    out_indices = np.zeros(lengths[mask].sum(), dtype=np.int32)
    out_offsets = np.zeros(mask.sum() + 1, dtype=np.int32)

    # Build new itemsets
    offset = 0
    j = 1
    for i in range(len(mask)):
        keep = mask[i]
        if keep:
            length = lengths[i]
            out_indices[offset : offset + length] = indices[offsets[i] : offsets[i + 1]]
            offset += length
            out_offsets[j] = offset
            j += 1
    return out_indices, out_offsets

Tasks#

Tasks are high-level building blocks used to define an optimization problem.

Task #

Abstract training task.

Source code in src/itembed/task.py
class Task:
    """Abstract training task."""

    def __init__(self, learning_rate_scale):
        self.learning_rate_scale = learning_rate_scale

    def do_batch(self, learning_rate):
        """Apply training step."""

        raise NotImplementedError()

do_batch #

do_batch(learning_rate)

Apply training step.

Source code in src/itembed/task.py
def do_batch(self, learning_rate):
    """Apply training step."""

    raise NotImplementedError()

UnsupervisedTask #

Bases: Task

Unsupervised training task.

See Also

do_unsupervised_steps

Parameters:

items : int32, num_item, required
    Itemsets, concatenated.
offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
syn0 : float32, num_label x num_dimension, required
    First set of embeddings.
syn1 : float32, num_label x num_dimension, required
    Second set of embeddings.
weights : float32, num_item, optional (default: None)
    Item weights, concatenated.
num_negative : int32, optional (default: 5)
    Number of negative samples.
learning_rate_scale : float32, optional (default: 1.0)
    Learning rate multiplier.
batch_size : int32, optional (default: 64)
    Batch size.
Source code in src/itembed/task.py
class UnsupervisedTask(Task):
    """Unsupervised training task.

    See Also
    --------
    :meth:`do_unsupervised_steps`

    Parameters
    ----------
    items: int32, num_item
        Itemsets, concatenated.
    offsets: int32, num_itemset + 1
        Boundaries in packed items.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    weights: float32, num_item, optional
        Item weights, concatenated.
    num_negative: int32, optional
        Number of negative samples.
    learning_rate_scale: float32, optional
        Learning rate multiplier.
    batch_size: int32, optional
        Batch size.

    """

    def __init__(
        self,
        items,
        offsets,
        syn0,
        syn1,
        *,
        weights=None,
        num_negative=5,
        learning_rate_scale=1.0,
        batch_size=64,
    ):
        super().__init__(learning_rate_scale)

        # Make sure that there are no index overflows
        assert syn0.shape == syn1.shape, "synsets shape mismatch"
        assert items.min() >= 0, "negative item index"
        assert items.max() < syn0.shape[0], "out-of-bound item index"
        assert offsets.shape[0] > 1, "no itemset"
        assert offsets.min() >= 0, "negative offset"
        assert offsets.max() <= items.shape[0], "out-of-bound offset"
        assert (offsets[1:] - offsets[:-1] >= 2).all(), "itemset size must be >= 2"

        # Allocate unit weights, if needed
        if weights is None:
            weights = np.ones(items.shape[0], dtype=np.float32)
        else:
            assert weights.shape == items.shape, "weights shape mismatch"

        # Store parameters
        self.items = items
        self.weights = weights
        self.offsets = offsets
        self.syn0 = syn0
        self.syn1 = syn1
        self.num_negative = num_negative
        self.batch_size = batch_size

        # Allocate internal buffer
        size = syn0.shape[1]
        self._tmp_syn = np.empty(size, dtype=np.float32)

        # Instantiate index generator
        num_itemset = offsets.shape[0] - 1
        self.batch_iterator = index_batch_stream(num_itemset, batch_size)

    def __len__(self):
        # Offsets array holds one more entry than there are itemsets
        num_itemset = self.offsets.shape[0] - 1
        return (num_itemset - 1) // self.batch_size + 1

    def do_batch(self, learning_rate):
        indices = next(self.batch_iterator)
        do_unsupervised_batch(
            self.items,
            self.weights,
            self.offsets,
            indices,
            self.syn0,
            self.syn1,
            self._tmp_syn,
            self.num_negative,
            learning_rate * self.learning_rate_scale,
        )
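
A hypothetical end-to-end sketch, assuming itemsets were packed with pack_itemsets and embeddings allocated with initialize_syn (the embedding size 64 is illustrative):

>>> labels, indices, offsets = pack_itemsets(itemsets, min_length=2)
>>> syn0 = initialize_syn(len(labels), 64)
>>> syn1 = initialize_syn(len(labels), 64)
>>> task = UnsupervisedTask(indices, offsets, syn0, syn1)
>>> train(task, num_epoch=10)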

SupervisedTask #

Bases: Task

Supervised training task.

See Also

do_supervised_steps

Parameters:

left_items : int32, num_left_item, required
    Itemsets, concatenated.
left_offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
right_items : int32, num_right_item, required
    Itemsets, concatenated.
right_offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
left_syn : float32, num_left_label x num_dimension, required
    Feature embeddings.
right_syn : float32, num_right_label x num_dimension, required
    Label embeddings.
left_weights : float32, num_left_item, optional (default: None)
    Item weights, concatenated.
right_weights : float32, num_right_item, optional (default: None)
    Item weights, concatenated.
num_negative : int32, optional (default: 5)
    Number of negative samples.
learning_rate_scale : float32, optional (default: 1.0)
    Learning rate multiplier.
batch_size : int32, optional (default: 64)
    Batch size.
Source code in src/itembed/task.py
class SupervisedTask(Task):
    """Supervised training task.

    See Also
    --------
    :meth:`do_supervised_steps`

    Parameters
    ----------
    left_items: int32, num_left_item
        Itemsets, concatenated.
    left_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    right_items: int32, num_right_item
        Itemsets, concatenated.
    right_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    left_weights: float32, num_left_item, optional
        Item weights, concatenated.
    right_weights: float32, num_right_item, optional
        Item weights, concatenated.
    num_negative: int32, optional
        Number of negative samples.
    learning_rate_scale: float32, optional
        Learning rate multiplier.
    batch_size: int32, optional
        Batch size.

    """

    def __init__(
        self,
        left_items,
        left_offsets,
        right_items,
        right_offsets,
        left_syn,
        right_syn,
        *,
        left_weights=None,
        right_weights=None,
        num_negative=5,
        learning_rate_scale=1.0,
        batch_size=64,
    ):
        super().__init__(learning_rate_scale)

        # Make sure that there are no index overflows
        assert left_syn.shape[1] == right_syn.shape[1], "embedding size mismatch"
        assert left_items.min() >= 0, "negative item index"
        assert right_items.min() >= 0, "negative item index"
        assert left_items.max() < left_syn.shape[0], "out-of-bound item index"
        assert right_items.max() < right_syn.shape[0], "out-of-bound item index"
        assert left_offsets.shape == right_offsets.shape, "offsets shape mismatch"
        assert left_offsets.shape[0] > 1, "no itemset"
        assert right_offsets.shape[0] > 1, "no itemset"
        assert left_offsets.min() >= 0, "negative offset"
        assert right_offsets.min() >= 0, "negative offset"
        assert left_offsets.max() <= left_items.shape[0], "out-of-bound offset"
        assert right_offsets.max() <= right_items.shape[0], "out-of-bound offset"
        assert (
            left_offsets[1:] - left_offsets[:-1] >= 1
        ).all(), "itemset size must be >= 1"
        assert (
            right_offsets[1:] - right_offsets[:-1] >= 1
        ).all(), "itemset size must be >= 1"

        # Allocate unit weights, if needed
        if left_weights is None:
            left_weights = np.ones(left_items.shape[0], dtype=np.float32)
        else:
            assert left_weights.shape == left_items.shape, "weights shape mismatch"
        if right_weights is None:
            right_weights = np.ones(right_items.shape[0], dtype=np.float32)
        else:
            assert right_weights.shape == right_items.shape, "weights shape mismatch"

        # Store parameters
        self.left_items = left_items
        self.left_offsets = left_offsets
        self.left_weights = left_weights
        self.right_items = right_items
        self.right_offsets = right_offsets
        self.right_weights = right_weights
        self.left_syn = left_syn
        self.right_syn = right_syn
        self.num_negative = num_negative
        self.batch_size = batch_size

        # Allocate internal buffer
        size = left_syn.shape[1]
        self._tmp_syn = np.empty(size, dtype=np.float32)

        # Instantiate index generator
        num_itemset = left_offsets.shape[0] - 1
        self.batch_iterator = index_batch_stream(num_itemset, batch_size)

    def __len__(self):
        # Offsets array holds one more entry than there are itemsets
        num_itemset = self.left_offsets.shape[0] - 1
        return (num_itemset - 1) // self.batch_size + 1

    def do_batch(self, learning_rate):
        indices = next(self.batch_iterator)
        do_supervised_batch(
            self.left_items,
            self.left_weights,
            self.left_offsets,
            indices,
            self.right_items,
            self.right_weights,
            self.right_offsets,
            indices,
            self.left_syn,
            self.right_syn,
            self._tmp_syn,
            self.num_negative,
            learning_rate * self.learning_rate_scale,
        )
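
A hypothetical sketch, assuming features and labels were packed separately (names and the embedding size are illustrative; both packed collections must describe the same number of itemsets):

>>> left_syn = initialize_syn(len(left_labels), 64)
>>> right_syn = initialize_syn(len(right_labels), 64)
>>> task = SupervisedTask(
...     left_items, left_offsets,
...     right_items, right_offsets,
...     left_syn, right_syn,
... )
>>> train(task)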

CompoundTask #

Bases: Task

Group multiple sub-tasks together.

Parameters:

*tasks : list of Task
    Collection of tasks to train jointly.
learning_rate_scale : float32, optional (default: 1.0)
    Learning rate multiplier.
Source code in src/itembed/task.py
class CompoundTask(Task):
    """Group multiple sub-tasks together.

    Parameters
    ----------
    *tasks: list of Task
        Collection of tasks to train jointly.
    learning_rate_scale: float32, optional
        Learning rate multiplier.

    """

    def __init__(self, *tasks, learning_rate_scale=1.0):
        super().__init__(learning_rate_scale)
        assert len(tasks) > 0
        self.tasks = tasks

    def __len__(self):
        return max(len(task) for task in self.tasks)

    def do_batch(self, learning_rate):
        learning_rate = learning_rate * self.learning_rate_scale
        for task in self.tasks:
            task.do_batch(learning_rate)
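
A brief sketch, assuming an unsupervised and a supervised task built as above (names are illustrative):

>>> task = CompoundTask(unsupervised_task, supervised_task)
>>> train(task)

Each do_batch call forwards one batch to every sub-task, and the compound length is that of the largest sub-task.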

Training tools#

Helpers for embedding initialization and the training loop:

initialize_syn #

initialize_syn(num_label, num_dimension, method='uniform')

Allocate and initialize embedding set.

Parameters:

num_label : int32, required
    Number of labels.
num_dimension : int32, required
    Size of embeddings.
method : {"uniform", "zero"}, optional (default: "uniform")
    Initialization method.

Returns:

syn : float32, num_label x num_dimension
    Embedding set.

Source code in src/itembed/util.py
def initialize_syn(num_label, num_dimension, method="uniform"):
    """Allocate and initialize embedding set.

    Parameters
    ----------
    num_label: int32
        Number of labels.
    num_dimension: int32
        Size of embeddings.
    method: {"uniform", "zero"}, optional
        Initialization method.

    Returns
    -------
    syn: float32, num_label x num_dimension
        Embedding set.

    """

    if method == "zero":
        syn = np.zeros((num_label, num_dimension), dtype=np.float32)
    elif method == "uniform":
        syn = np.random.rand(num_label, num_dimension).astype(np.float32)
        syn -= 0.5
        syn /= num_dimension
    else:
        raise KeyError(method)
    return syn
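
Examples:

A minimal sketch; with "uniform", values are drawn in [-0.5, 0.5) and scaled down by the number of dimensions:

>>> syn = initialize_syn(100, 64)
>>> syn.shape
(100, 64)
>>> syn.dtype
dtype('float32')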

train #

train(task, *, num_epoch=10, initial_learning_rate=0.025, final_learning_rate=0.0)

Train loop.

Learning rate decreases linearly from the initial to the final value.

Keyboard interruptions are silently caught, stopping the training early.

A progress bar is shown, using tqdm.

Parameters:

task : Task, required
    Top-level task to train.
num_epoch : int, optional (default: 10)
    Number of passes across the whole task.
initial_learning_rate : float, optional (default: 0.025)
    Maximum learning rate (inclusive).
final_learning_rate : float, optional (default: 0.0)
    Minimum learning rate (exclusive).
Source code in src/itembed/util.py
def train(
    task,
    *,
    num_epoch=10,
    initial_learning_rate=0.025,
    final_learning_rate=0.0,
):
    """Train loop.

    Learning rate decreases linearly from the initial to the final value.

    Keyboard interruptions are silently caught, stopping the training early.

    A progress bar is shown, using ``tqdm``.

    Parameters
    ----------
    task: Task
        Top-level task to train.
    num_epoch: int
        Number of passes across the whole task.
    initial_learning_rate: float
        Maximum learning rate (inclusive).
    final_learning_rate: float
        Minimum learning rate (exclusive).

    """

    # Iterate over epochs and batches, with linearly decreasing learning rate
    try:
        num_batch = len(task)
        num_step = num_epoch * num_batch
        delta_learning_rate = final_learning_rate - initial_learning_rate
        step = 0
        with tqdm(total=num_step) as progress:
            for epoch in range(num_epoch):
                for batch in range(num_batch):
                    learning_rate = (
                        delta_learning_rate * step / num_step + initial_learning_rate
                    )
                    task.do_batch(learning_rate)
                    step += 1
                    progress.update(1)

    # Allow soft interruption
    except KeyboardInterrupt:
        pass
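
The schedule interpolates linearly between the two bounds; a standalone sketch of the formula used above:

>>> initial, final, num_step = 0.025, 0.0, 100
>>> [(final - initial) * step / num_step + initial for step in (0, 50)]
[0.025, 0.0125]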

Postprocessing tools#

Once embeddings are trained, some methods are provided to normalize and use them.

softmax #

softmax(x)

Compute softmax.

Source code in src/itembed/util.py
def softmax(x):
    """Compute softmax."""

    e = np.exp(x)
    return e / e.sum(axis=-1)[..., None]
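
A quick sanity check (each row sums to one):

>>> p = softmax(np.array([0.0, 1.0, 2.0]))
>>> bool(np.allclose(p.sum(), 1.0))
True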

norm #

norm(x)

L2 norm.

Source code in src/itembed/util.py
def norm(x):
    """L\\ :sub:`2` norm."""

    return np.sqrt((x**2).sum(axis=-1))
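
A quick check on a 3-4-5 triangle:

>>> float(norm(np.array([3.0, 4.0])))
5.0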

normalize #

normalize(x)

L2 normalization.

Source code in src/itembed/util.py
def normalize(x):
    """L\\ :sub:`2` normalization."""

    return x / norm(x)[..., None]
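
A quick check (the result has unit norm):

>>> normalize(np.array([3.0, 4.0]))
array([0.6, 0.8])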

Low-level optimization methods#

At its core, itembed is a set of optimized methods.

expit #

expit(x)

Compute logistic activation.

Source code in src/itembed/optimization.py
@jit(
    float32(float32),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def expit(x):
    """Compute logistic activation."""

    return 1 / (1 + math.exp(-x))
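
A quick check at the midpoint:

>>> expit(0.0)
0.5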

do_step #

do_step(left, right, syn_left, syn_right, tmp_syn, num_negative, learning_rate)

Apply a single training step.

Parameters:

left : int32, required
    Left-hand item.
right : int32, required
    Right-hand item.
syn_left : float32, num_left x num_dimension, required
    Left-hand embeddings.
syn_right : float32, num_right x num_dimension, required
    Right-hand embeddings.
tmp_syn : float32, num_dimension, required
    Internal buffer (allocated only once, for performance).
num_negative : int32, required
    Number of negative samples.
learning_rate : float32, required
    Learning rate.
Source code in src/itembed/optimization.py
@jit(
    void(
        int32,
        int32,
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_step(
    left,
    right,
    syn_left,
    syn_right,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply a single training step.

    Parameters
    ----------
    left: int32
        Left-hand item.
    right: int32
        Right-hand item.
    syn_left: float32, num_left x num_dimension
        Left-hand embeddings.
    syn_right: float32, num_right x num_dimension
        Right-hand embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    num_right, num_dimension = syn_right.shape

    # Approximate softmax, by applying a single positive and many negative updates
    tmp_syn[:] = 0
    for n in range(num_negative + 1):
        if n == 0:
            target = right
            label = 1
        else:
            target = random.randint(0, num_right - 1)
            label = 0

        # Compute dot product between reference and target
        logit = np.dot(syn_left[left], syn_right[target])

        # Compute gradient scale
        gradient = (label - expit(logit)) * learning_rate

        # Accumulate gradients for left-hand embeddings
        for c in range(num_dimension):
            tmp_syn[c] += gradient * syn_right[target, c]

        # Backpropagate to right-hand embeddings
        for c in range(num_dimension):
            syn_right[target, c] += gradient * syn_left[left, c]

    # Backpropagate to left-hand embeddings
    for c in range(num_dimension):
        syn_left[left, c] += tmp_syn[c]
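
For intuition, each iteration above is one logistic-regression step on the pair (left, target): with p = expit(dot(syn_left[left], syn_right[target])), each side moves by (label - p) * learning_rate times the other side. A standalone numpy sketch of a single positive step (values are illustrative):

>>> u = np.full(4, 0.5, dtype=np.float32)  # syn_left[left]
>>> v = np.full(4, 0.5, dtype=np.float32)  # syn_right[target]
>>> g = (1.0 - 1.0 / (1.0 + np.exp(-u @ v))) * 0.025  # label=1
>>> du = g * v  # accumulated in tmp_syn before v changes
>>> v += g * u
>>> u += du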

do_supervised_steps #

do_supervised_steps(left_itemset, right_itemset, left_weights, right_weights, left_syn, right_syn, tmp_syn, num_negative, learning_rate)

Apply steps from two itemsets.

This is used in a supervised setting, where left-hand items are features and right-hand items are labels.

Parameters:

left_itemset : int32, left_length, required
    Feature items.
right_itemset : int32, right_length, required
    Label items.
left_weights : float32, left_length, required
    Feature item weights.
right_weights : float32, right_length, required
    Label item weights.
left_syn : float32, num_left_label x num_dimension, required
    Feature embeddings.
right_syn : float32, num_right_label x num_dimension, required
    Label embeddings.
tmp_syn : float32, num_dimension, required
    Internal buffer (allocated only once, for performance).
num_negative : int32, required
    Number of negative samples.
learning_rate : float32, required
    Learning rate.
Source code in src/itembed/optimization.py
@jit(
    void(
        int32[:],
        int32[:],
        float32[:],
        float32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_supervised_steps(
    left_itemset,
    right_itemset,
    left_weights,
    right_weights,
    left_syn,
    right_syn,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply steps from two itemsets.

    This is used in a supervised setting, where left-hand items are features
    and right-hand items are labels.

    Parameters
    ----------
    left_itemset: int32, left_length
        Feature items.
    right_itemset: int32, right_length
        Label items.
    left_weights: float32, left_length
        Feature item weights.
    right_weights: float32, right_length
        Label item weights.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    # TODO maybe need to apply subsampling?
    # TODO possibly two passes, to guarantee that each itemset is fully used?
    for i in range(len(left_itemset)):
        for j in range(len(right_itemset)):
            do_step(
                left_itemset[i],
                right_itemset[j],
                left_syn,
                right_syn,
                tmp_syn,
                num_negative,
                learning_rate * left_weights[i] * right_weights[j],
            )

do_unsupervised_steps #

do_unsupervised_steps(itemset, weights, syn0, syn1, tmp_syn, num_negative, learning_rate)

Apply steps from a single itemset.

This is used in an unsupervised setting, where co-occurrence is used as a knowledge source. It follows the skip-gram method, as introduced by Mikolov et al.

For each item, a single random neighbor is sampled to define a pair. This means that only a subset of possible pairs is considered. The reason is twofold: training stays in linear complexity w.r.t. itemset lengths and large itemsets do not dominate smaller ones.

Itemset must have at least 2 items. Length is not checked, for efficiency.

Parameters:

itemset : int32, length, required
    Items.
weights : float32, length, required
    Item weights.
syn0 : float32, num_label x num_dimension, required
    First set of embeddings.
syn1 : float32, num_label x num_dimension, required
    Second set of embeddings.
tmp_syn : float32, num_dimension, required
    Internal buffer (allocated only once, for performance).
num_negative : int32, required
    Number of negative samples.
learning_rate : float32, required
    Learning rate.
Source code in src/itembed/optimization.py
@jit(
    void(
        int32[:],
        float32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_unsupervised_steps(
    itemset,
    weights,
    syn0,
    syn1,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply steps from a single itemset.

    This is used in an unsupervised setting, where co-occurrence is used as a
    knowledge source. It follows the skip-gram method, as introduced by Mikolov
    et al.

    For each item, a single random neighbor is sampled to define a pair. This
    means that only a subset of possible pairs is considered. The reason is
    twofold: training stays in linear complexity w.r.t. itemset lengths and
    large itemsets do not dominate smaller ones.

    Itemset must have at least 2 items. Length is not checked, for efficiency.

    Parameters
    ----------
    itemset: int32, length
        Items.
    weights: float32, length
        Item weights.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    num_label, num_dimension = syn0.shape
    length = itemset.shape[0]

    # For each word, update a single random neighbor
    for i in range(length):
        j = random.randint(0, length - 2)
        if j >= i:
            j += 1
        do_step(
            itemset[i],
            itemset[j],
            syn0,
            syn1,
            tmp_syn,
            num_negative,
            learning_rate * weights[i] * weights[j],
        )
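
The neighbor draw above samples j uniformly among all positions except i; a standalone sketch of the trick:

>>> import random
>>> length, i = 5, 2
>>> j = random.randint(0, length - 2)  # one fewer slot than positions
>>> if j >= i:
...     j += 1  # skip over i, so j != i is guaranteed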

do_supervised_batch #

do_supervised_batch(left_items, left_weights, left_offsets, left_indices, right_items, right_weights, right_offsets, right_indices, left_syn, right_syn, tmp_syn, num_negative, learning_rate)

Apply supervised steps from multiple itemsets.

See Also

do_supervised_steps

Parameters:

left_items : int32, num_left_item, required
    Itemsets, concatenated.
left_weights : float32, num_left_item, required
    Item weights, concatenated.
left_offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
left_indices : int32, num_step, required
    Subset of offsets to consider.
right_items : int32, num_right_item, required
    Itemsets, concatenated.
right_weights : float32, num_right_item, required
    Item weights, concatenated.
right_offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
right_indices : int32, num_step, required
    Subset of offsets to consider.
left_syn : float32, num_left_label x num_dimension, required
    Feature embeddings.
right_syn : float32, num_right_label x num_dimension, required
    Label embeddings.
tmp_syn : float32, num_dimension, required
    Internal buffer (allocated only once, for performance).
num_negative : int32, required
    Number of negative samples.
learning_rate : float32, required
    Learning rate.
Source code in src/itembed/optimization.py
@jit(
    void(
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_supervised_batch(
    left_items,
    left_weights,
    left_offsets,
    left_indices,
    right_items,
    right_weights,
    right_offsets,
    right_indices,
    left_syn,
    right_syn,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply supervised steps from multiple itemsets.

    See Also
    --------
    :meth:`do_supervised_steps`

    Parameters
    ----------
    left_items: int32, num_left_item
        Itemsets, concatenated.
    left_weights: float32, num_left_item
        Item weights, concatenated.
    left_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    left_indices: int32, num_step
        Subset of offsets to consider.
    right_items: int32, num_right_item
        Itemsets, concatenated.
    right_weights: float32, num_right_item
        Item weights, concatenated.
    right_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    right_indices: int32, num_step
        Subset of offsets to consider.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    length = left_indices.shape[0]
    for i in range(length):
        j = left_indices[i]
        k = right_indices[i]
        do_supervised_steps(
            left_items[left_offsets[j] : left_offsets[j + 1]],
            right_items[right_offsets[k] : right_offsets[k + 1]],
            left_weights[left_offsets[j] : left_offsets[j + 1]],
            right_weights[right_offsets[k] : right_offsets[k + 1]],
            left_syn,
            right_syn,
            tmp_syn,
            num_negative,
            learning_rate,
        )

do_unsupervised_batch #

do_unsupervised_batch(items, weights, offsets, indices, syn0, syn1, tmp_syn, num_negative, learning_rate)

Apply unsupervised steps from multiple itemsets.

See Also

do_unsupervised_steps

Parameters:

items : int32, num_item, required
    Itemsets, concatenated.
weights : float32, num_item, required
    Item weights, concatenated.
offsets : int32, num_itemset + 1, required
    Boundaries in packed items.
indices : int32, num_step, required
    Subset of offsets to consider.
syn0 : float32, num_label x num_dimension, required
    First set of embeddings.
syn1 : float32, num_label x num_dimension, required
    Second set of embeddings.
tmp_syn : float32, num_dimension, required
    Internal buffer (allocated only once, for performance).
num_negative : int32, required
    Number of negative samples.
learning_rate : float32, required
    Learning rate.
Source code in src/itembed/optimization.py
@jit(
    void(
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_unsupervised_batch(
    items,
    weights,
    offsets,
    indices,
    syn0,
    syn1,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply unsupervised steps from multiple itemsets.

    See Also
    --------
    :meth:`do_unsupervised_steps`

    Parameters
    ----------
    items: int32, num_item
        Itemsets, concatenated.
    weights: float32, num_item
        Item weights, concatenated.
    offsets: int32, num_itemset + 1
        Boundaries in packed items.
    indices: int32, num_step
        Subset of offsets to consider.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    for i in indices:
        do_unsupervised_steps(
            items[offsets[i] : offsets[i + 1]],
            weights[offsets[i] : offsets[i + 1]],
            syn0,
            syn1,
            tmp_syn,
            num_negative,
            learning_rate,
        )