Reference - itembed

index_batch_stream #

index_batch_stream(num_index, batch_size)

Indices generator.

Source code in src/itembed/util.py

def index_batch_stream(num_index, batch_size):
    """Endlessly yield shuffled batches of indices.

    Parameters
    ----------
    num_index: int
        Number of distinct indices; values are drawn from
        ``0 .. num_index - 1``. Must be positive.
    batch_size: int
        Length of every yielded batch. Must be positive.

    Yields
    ------
    batch: int32, batch_size
        Slice of an internal buffer. The buffer is reshuffled in place at the
        start of every pass, so a batch should be consumed (or copied) before
        requesting further ones.

    """

    assert num_index > 0
    assert batch_size > 0

    pool = np.arange(num_index, dtype=np.int32)
    # TODO improve this algorithm

    # When there are fewer indices than a batch needs, repeat them until at
    # least one complete batch can be cut from the pool
    if num_index < batch_size:
        copies = batch_size // num_index + 1
        pool = np.tile(pool, copies)
        num_index *= copies

    # Never terminates: each pass reshuffles, then emits every full batch;
    # leftover indices that do not fill a batch are dropped for that pass
    last_start = num_index - batch_size
    while True:
        np.random.shuffle(pool)
        for start in range(0, last_start + 1, batch_size):
            yield pool[start : start + batch_size]

pack_itemsets #

pack_itemsets(itemsets, *, min_count=1, min_length=1)

Convert itemset collection to packed indices.

Parameters:

Name	Description	Default
`itemsets`	List of sets of hashable objects.	required
`min_count`	Minimal frequency count to be kept.	`1`
`min_length`	Minimal itemset length.	`1`

Returns:

Name	Type	Description
`labels`	`list of object`	Mapping from indices to labels.
`indices`	`(int32, num_item)`	Packed index array.
`offsets`	`(int32, num_itemset + 1)`	Itemsets offsets in packed array.

Examples:

>>> itemsets = [
...     ["apple"],
...     ["apple", "sugar", "flour"],
...     ["pear", "sugar", "flour", "butter"],
...     ["apple", "pear", "sugar", "butter", "cinnamon"],
...     ["salt", "flour", "oil"],
... ]
>>> pack_itemsets(itemsets, min_length=2)
(['apple', 'sugar', 'flour', 'pear', 'butter', 'cinnamon', 'salt', 'oil'],
 array([0, 1, 2, 3, 1, 2, 4, 0, 3, 1, 4, 5, 6, 2, 7]),
 array([ 0,  3,  7, 12, 15]))

Source code in src/itembed/util.py

def pack_itemsets(itemsets, *, min_count=1, min_length=1):
    """Encode a collection of itemsets as flat index and offset arrays.

    Labels are numbered by decreasing frequency. ``None`` is never treated as
    a label. Labels seen fewer than ``min_count`` times are dropped from the
    itemsets, and itemsets left with fewer than ``min_length`` items are
    discarded entirely.

    Parameters
    ----------
    itemsets: list of list of object
        List of sets of hashable objects. It is traversed twice.
    min_count: int, optional
        Minimal frequency count to be kept.
    min_length: int, optional
        Minimal itemset length.

    Returns
    -------
    labels: list of object
        Mapping from indices to labels.
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.

    Examples
    --------
    >>> itemsets = [
    ...     ["apple"],
    ...     ["apple", "sugar", "flour"],
    ...     ["pear", "sugar", "flour", "butter"],
    ...     ["apple", "pear", "sugar", "butter", "cinnamon"],
    ...     ["salt", "flour", "oil"],
    ... ]
    >>> labels, indices, offsets = pack_itemsets(itemsets, min_length=2)
    >>> labels
    ['apple', 'sugar', 'flour', 'pear', 'butter', 'cinnamon', 'salt', 'oil']
    >>> indices.tolist()
    [0, 1, 2, 3, 1, 2, 4, 0, 3, 1, 4, 5, 6, 2, 7]
    >>> offsets.tolist()
    [0, 3, 7, 12, 15]

    """

    # First pass: label frequencies, ignoring the None placeholder
    frequencies = Counter()
    for itemset in itemsets:
        frequencies.update(itemset)
    frequencies.pop(None, None)

    # Frequent labels come first, so they receive the smallest indices
    labels = [
        label for label, count in frequencies.most_common() if count >= min_count
    ]
    label_to_index = {label: position for position, label in enumerate(labels)}

    # Second pass: translate each itemset, skipping unknown labels
    packed = []
    boundaries = [0]
    for itemset in itemsets:
        translated = [
            label_to_index[label] for label in itemset if label in label_to_index
        ]
        if len(translated) < min_length:
            continue
        packed.extend(translated)
        boundaries.append(len(packed))

    return (
        labels,
        np.array(packed, dtype=np.int32),
        np.array(boundaries, dtype=np.int32),
    )

prune_itemsets #

prune_itemsets(indices, offsets, *, mask=None, min_length=None)

Filter packed indices.

Either an explicit mask or a length threshold must be defined.

Parameters:

Name	Description	Default
`indices`	Packed index array.	required
`offsets`	Itemsets offsets in packed array.	required
`mask`	Boolean mask.	`None`
`min_length`	Minimum length, inclusive.	`None`

Returns:

Name	Type	Description
`indices`	`(int32, num_item)`	Packed index array.
`offsets`	`(int32, num_itemset + 1)`	Itemsets offsets in packed array.

Examples:

>>> indices = np.array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3])
>>> offsets = np.array([0, 1, 3, 6, 10])
>>> mask = np.array([True, True, False, True])
>>> prune_itemsets(indices, offsets, mask=mask, min_length=2)
(array([0, 1, 0, 1, 2, 3]), array([0, 2, 6]))

Source code in src/itembed/util.py

def prune_itemsets(indices, offsets, *, mask=None, min_length=None):
    """Filter packed indices.

    Either an explicit mask or a length threshold must be defined. When both
    are given, an itemset is kept only if it satisfies both.

    Parameters
    ----------
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.
    mask: bool, num_itemset
        Boolean mask; any array-like is accepted and coerced to booleans.
    min_length: int
        Minimum length, inclusive.

    Returns
    -------
    indices: int32, num_item
        Packed index array.
    offsets: int32, num_itemset + 1
        Itemsets offsets in packed array.

    Raises
    ------
    ValueError
        If neither ``mask`` nor ``min_length`` is provided.
    AssertionError
        If ``mask`` does not have one entry per itemset.

    Examples
    --------
    >>> indices = np.array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3])
    >>> offsets = np.array([0, 1, 3, 6, 10])
    >>> mask = np.array([True, True, False, True])
    >>> out_indices, out_offsets = prune_itemsets(
    ...     indices, offsets, mask=mask, min_length=2
    ... )
    >>> out_indices.tolist()
    [0, 1, 0, 1, 2, 3]
    >>> out_offsets.tolist()
    [0, 2, 6]

    """

    # Fail early with a clear message instead of tripping on ``None.shape``
    if mask is None and min_length is None:
        raise ValueError("either mask or min_length must be defined")

    indices = np.asarray(indices)
    offsets = np.asarray(offsets)
    lengths = offsets[1:] - offsets[:-1]

    # Build the effective mask from the explicit mask and/or length limit
    if mask is None:
        mask = lengths >= min_length
    else:
        # Coerce so that lists and 0/1 integer masks behave as boolean masks
        # rather than being rejected or used as fancy indices
        mask = np.asarray(mask, dtype=bool)
        assert lengths.shape == mask.shape
        if min_length is not None:
            mask = mask & (lengths >= min_length)

    # Allocate buffers
    kept = np.flatnonzero(mask)
    out_indices = np.zeros(lengths[kept].sum(), dtype=np.int32)
    out_offsets = np.zeros(len(kept) + 1, dtype=np.int32)

    # Copy retained itemsets back to back, recording each new end offset
    cursor = 0
    for slot, i in enumerate(kept, start=1):
        length = lengths[i]
        out_indices[cursor : cursor + length] = indices[offsets[i] : offsets[i + 1]]
        cursor += length
        out_offsets[slot] = cursor
    return out_indices, out_offsets

Task #

Abstract training task.

Source code in src/itembed/task.py

class Task:
    """Base class shared by all training tasks.

    The base class only stores the learning rate multiplier; subclasses are
    expected to provide :meth:`do_batch`.

    Parameters
    ----------
    learning_rate_scale: float
        Learning rate multiplier, kept as the ``learning_rate_scale``
        attribute.

    """

    def __init__(self, learning_rate_scale):
        self.learning_rate_scale = learning_rate_scale

    def do_batch(self, learning_rate):
        """Run one training step; concrete tasks must override this."""

        raise NotImplementedError

do_batch #

do_batch(learning_rate)

Apply training step.

Source code in src/itembed/task.py

def do_batch(self, learning_rate):
    """Run one training step; concrete tasks must override this."""

    raise NotImplementedError

UnsupervisedTask #

Bases: Task

Unsupervised training task.

See Also

`do_unsupervised_steps`

Parameters:

Name	Description	Default
`items`	Itemsets, concatenated.	required
`offsets`	Boundaries in packed items.	required
`syn0`	First set of embeddings.	required
`syn1`	Second set of embeddings.	required
`weights`	Item weights, concatenated.	`None`
`num_negative`	Number of negative samples.	`5`
`learning_rate_scale`	Learning rate multiplier.	`1.0`
`batch_size`	Batch size.	`64`

Source code in src/itembed/task.py

class UnsupervisedTask(Task):
    """Unsupervised training task.

    See Also
    --------
    :meth:`do_unsupervised_steps`

    Parameters
    ----------
    items: int32, num_item
        Itemsets, concatenated.
    offsets: int32, num_itemset + 1
        Boundaries in packed items.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    weights: float32, num_item, optional
        Item weights, concatenated. Defaults to unit weights.
    num_negative: int32, optional
        Number of negative samples.
    learning_rate_scale: float32, optional
        Learning rate multiplier.
    batch_size: int32, optional
        Batch size.

    """

    def __init__(
        self,
        items,
        offsets,
        syn0,
        syn1,
        *,
        weights=None,
        num_negative=5,
        learning_rate_scale=1.0,
        batch_size=64,
    ):
        super().__init__(learning_rate_scale)

        # Make sure that there are no index overflow
        assert syn0.shape == syn1.shape, "synsets shape mismatch"
        assert items.min() >= 0, "negative item index"
        assert items.max() < syn0.shape[0], "out-of-bound item index"
        assert offsets.shape[0] > 1, "no itemset"
        assert offsets.min() >= 0, "negative offset"
        assert offsets.max() <= items.shape[0], "out-of-bound offset"
        # Pairs are sampled inside an itemset, so singletons cannot be used
        assert (offsets[1:] - offsets[:-1] >= 2).all(), "itemset size must be >= 2"

        # Allocate unit weights, if needed
        if weights is None:
            weights = np.ones(items.shape[0], dtype=np.float32)
        else:
            assert weights.shape == items.shape, "weights shape mismatch"

        # Store parameters
        self.items = items
        self.weights = weights
        self.offsets = offsets
        self.syn0 = syn0
        self.syn1 = syn1
        self.num_negative = num_negative
        self.batch_size = batch_size

        # Allocate internal buffer, reused by every training step
        size = syn0.shape[1]
        self._tmp_syn = np.empty(size, dtype=np.float32)

        # Instantiate index generator
        num_itemset = offsets.shape[0] - 1
        self.batch_iterator = index_batch_stream(num_itemset, batch_size)

    def __len__(self):
        """Number of batches per epoch, ``ceil(num_itemset / batch_size)``."""

        # offsets holds one more entry than there are itemsets
        (num_offset,) = self.offsets.shape
        num_itemset = num_offset - 1
        return (num_itemset - 1) // self.batch_size + 1

    def do_batch(self, learning_rate):
        """Apply training step on the next batch of itemsets.

        The learning rate passed down is ``learning_rate`` multiplied by
        ``learning_rate_scale``.

        """

        indices = next(self.batch_iterator)
        do_unsupervised_batch(
            self.items,
            self.weights,
            self.offsets,
            indices,
            self.syn0,
            self.syn1,
            self._tmp_syn,
            self.num_negative,
            learning_rate * self.learning_rate_scale,
        )

SupervisedTask #

Bases: Task

Supervised training task.

See Also

`do_supervised_steps`

Parameters:

Name	Description	Default
`left_items`	Itemsets, concatenated.	required
`left_offsets`	Boundaries in packed items.	required
`right_items`	Itemsets, concatenated.	required
`right_offsets`	Boundaries in packed items.	required
`left_syn`	Feature embeddings.	required
`right_syn`	Label embeddings.	required
`left_weights`	Item weights, concatenated.	`None`
`right_weights`	Item weights, concatenated.	`None`
`num_negative`	Number of negative samples.	`5`
`learning_rate_scale`	Learning rate multiplier.	`1.0`
`batch_size`	Batch size.	`64`

Source code in src/itembed/task.py

class SupervisedTask(Task):
    """Supervised training task.

    See Also
    --------
    :meth:`do_supervised_steps`

    Parameters
    ----------
    left_items: int32, num_left_item
        Itemsets, concatenated.
    left_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    right_items: int32, num_right_item
        Itemsets, concatenated.
    right_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    left_weights: float32, num_left_item, optional
        Item weights, concatenated. Defaults to unit weights.
    right_weights: float32, num_right_item, optional
        Item weights, concatenated. Defaults to unit weights.
    num_negative: int32, optional
        Number of negative samples.
    learning_rate_scale: float32, optional
        Learning rate multiplier.
    batch_size: int32, optional
        Batch size.

    """

    def __init__(
        self,
        left_items,
        left_offsets,
        right_items,
        right_offsets,
        left_syn,
        right_syn,
        *,
        left_weights=None,
        right_weights=None,
        num_negative=5,
        learning_rate_scale=1.0,
        batch_size=64,
    ):
        super().__init__(learning_rate_scale)

        # Make sure that there are no index overflow
        assert left_syn.shape[1] == right_syn.shape[1], "embedding size mismatch"
        assert left_items.min() >= 0, "negative item index"
        assert right_items.min() >= 0, "negative item index"
        assert left_items.max() < left_syn.shape[0], "out-of-bound item index"
        assert right_items.max() < right_syn.shape[0], "out-of-bound item index"
        # Left and right itemsets are paired by position
        assert left_offsets.shape == right_offsets.shape, "offsets shape mismatch"
        assert left_offsets.shape[0] > 1, "no itemset"
        assert right_offsets.shape[0] > 1, "no itemset"
        assert left_offsets.min() >= 0, "negative offset"
        assert right_offsets.min() >= 0, "negative offset"
        assert left_offsets.max() <= left_items.shape[0], "out-of-bound offset"
        assert right_offsets.max() <= right_items.shape[0], "out-of-bound offset"
        assert (
            left_offsets[1:] - left_offsets[:-1] >= 1
        ).all(), "itemset size must be >= 1"
        assert (
            right_offsets[1:] - right_offsets[:-1] >= 1
        ).all(), "itemset size must be >= 1"

        # Allocate unit weights, if needed
        if left_weights is None:
            left_weights = np.ones(left_items.shape[0], dtype=np.float32)
        else:
            assert left_weights.shape == left_items.shape, "weights shape mismatch"
        if right_weights is None:
            right_weights = np.ones(right_items.shape[0], dtype=np.float32)
        else:
            assert right_weights.shape == right_items.shape, "weights shape mismatch"

        # Store parameters
        self.left_items = left_items
        self.left_offsets = left_offsets
        self.left_weights = left_weights
        self.right_items = right_items
        self.right_offsets = right_offsets
        self.right_weights = right_weights
        self.left_syn = left_syn
        self.right_syn = right_syn
        self.num_negative = num_negative
        self.batch_size = batch_size

        # Allocate internal buffer, reused by every training step
        size = left_syn.shape[1]
        self._tmp_syn = np.empty(size, dtype=np.float32)

        # Instantiate index generator
        num_itemset = left_offsets.shape[0] - 1
        self.batch_iterator = index_batch_stream(num_itemset, batch_size)

    def __len__(self):
        """Number of batches per epoch, ``ceil(num_itemset / batch_size)``."""

        # offsets holds one more entry than there are itemsets
        (num_offset,) = self.left_offsets.shape
        num_itemset = num_offset - 1
        return (num_itemset - 1) // self.batch_size + 1

    def do_batch(self, learning_rate):
        """Apply training step on the next batch of itemset pairs.

        The same batch of indices selects both the left and the right
        itemsets. The learning rate passed down is ``learning_rate``
        multiplied by ``learning_rate_scale``.

        """

        indices = next(self.batch_iterator)
        do_supervised_batch(
            self.left_items,
            self.left_weights,
            self.left_offsets,
            indices,
            self.right_items,
            self.right_weights,
            self.right_offsets,
            indices,
            self.left_syn,
            self.right_syn,
            self._tmp_syn,
            self.num_negative,
            learning_rate * self.learning_rate_scale,
        )

CompoundTask #

Bases: Task

Group multiple sub-tasks together.

Parameters:

Name	Type	Description	Default
`*tasks`		Collection of tasks to train jointly.	`()`
`learning_rate_scale`		Learning rate multiplier.	`1.0`

Source code in src/itembed/task.py

class CompoundTask(Task):
    """Train several sub-tasks jointly, as if they were a single task.

    Parameters
    ----------
    *tasks: list of Task
        Sub-tasks to run at every step; at least one is required.
    learning_rate_scale: float32, optional
        Multiplier applied to the learning rate before it is handed to the
        sub-tasks (each sub-task then applies its own scale).

    """

    def __init__(self, *tasks, learning_rate_scale=1.0):
        super().__init__(learning_rate_scale)
        assert len(tasks) > 0
        self.tasks = tasks

    def __len__(self):
        """Length of the longest sub-task."""

        return max(map(len, self.tasks))

    def do_batch(self, learning_rate):
        """Run one step of every sub-task, in the order they were given."""

        scaled_rate = learning_rate * self.learning_rate_scale
        for sub_task in self.tasks:
            sub_task.do_batch(scaled_rate)

initialize_syn #

initialize_syn(num_label, num_dimension, method='uniform')

Allocate and initialize embedding set.

Parameters:

Name	Description	Default
`num_label`	Number of labels.	required
`num_dimension`	Size of embeddings.	required
`method`	Initialization method.	`'uniform'`

Returns:

Name	Type	Description
`syn`	`float32, num_label x num_dimension`	Embedding set.

Source code in src/itembed/util.py

def initialize_syn(num_label, num_dimension, method="uniform"):
    """Create a freshly initialized embedding matrix.

    Parameters
    ----------
    num_label: int32
        Number of labels (rows).
    num_dimension: int32
        Size of embeddings (columns).
    method: {"uniform", "zero"}, optional
        ``"zero"`` fills the matrix with zeros. ``"uniform"`` draws values
        with ``np.random.rand``, centers them around zero and divides them by
        ``num_dimension``.

    Returns
    -------
    syn: float32, num_label x num_dimension
        Embedding set.

    Raises
    ------
    KeyError
        If ``method`` is not one of the supported names.

    """

    if method == "zero":
        return np.zeros((num_label, num_dimension), dtype=np.float32)

    if method == "uniform":
        # Sample in [0, 1), shift to [-0.5, 0.5), then shrink by the width
        embeddings = np.random.rand(num_label, num_dimension).astype(np.float32)
        embeddings -= 0.5
        embeddings /= num_dimension
        return embeddings

    raise KeyError(method)

train #

train(task, *, num_epoch=10, initial_learning_rate=0.025, final_learning_rate=0.0)

Train loop.

Learning rate decreases linearly from `initial_learning_rate` towards `final_learning_rate` (zero by default).

Keyboard interruptions are silently captured and stop the training process early.

A progress bar is shown, using tqdm.

Parameters:

Name	Description	Default
`task`	Top-level task to train.	required
`num_epoch`	Number of passes across the whole task.	`10`
`initial_learning_rate`	Maximum learning rate (inclusive).	`0.025`
`final_learning_rate`	Minimum learning rate (exclusive).	`0.0`

Source code in src/itembed/util.py

def train(
    task,
    *,
    num_epoch=10,
    initial_learning_rate=0.025,
    final_learning_rate=0.0,
):
    """Run the training loop on a task.

    The task is stepped ``num_epoch * len(task)`` times. The learning rate
    handed to each step is interpolated linearly: the first step receives
    ``initial_learning_rate`` and later steps move towards
    ``final_learning_rate``, which is never reached.

    A ``KeyboardInterrupt`` raised during training is swallowed, so that the
    loop can be stopped early without an error.

    Progress is reported through a ``tqdm`` bar.

    Parameters
    ----------
    task: Task
        Top-level task to train.
    num_epoch: int
        Number of passes across the whole task.
    initial_learning_rate: float
        Maximum learning rate (inclusive).
    final_learning_rate: float
        Minimum learning rate (exclusive).

    """

    try:
        steps_per_epoch = len(task)
        total_steps = num_epoch * steps_per_epoch
        rate_span = final_learning_rate - initial_learning_rate

        # Epochs and batches are walked as one flat sequence of steps
        with tqdm(total=total_steps) as progress:
            for step in range(total_steps):
                current_rate = rate_span * step / total_steps + initial_learning_rate
                task.do_batch(current_rate)
                progress.update(1)

    # Allow soft interruption
    except KeyboardInterrupt:
        pass

softmax #

softmax(x)

Compute softmax.

Source code in src/itembed/util.py

def softmax(x):
    """Compute softmax along the last axis.

    Parameters
    ----------
    x: array-like, ... x num_class
        Unnormalized scores (logits).

    Returns
    -------
    p: ndarray, same shape as ``x``
        Non-negative values summing to one along the last axis.

    """

    x = np.asarray(x)

    # Softmax is invariant to a constant shift per row; subtracting the row
    # maximum keeps ``exp`` from overflowing to inf (and the result to NaN)
    if x.size:
        x = x - x.max(axis=-1, keepdims=True)

    e = np.exp(x)
    return e / e.sum(axis=-1)[..., None]

norm #

norm(x)

L2 norm.

Source code in src/itembed/util.py

def norm(x):
    """Euclidean (L2) norm, computed along the last axis."""

    squared = x**2
    total = squared.sum(axis=-1)
    return np.sqrt(total)

normalize #

normalize(x)

L2 normalization.

Source code in src/itembed/util.py

def normalize(x):
    """Scale vectors to unit Euclidean (L2) length along the last axis.

    Each vector is divided by its own norm, so an all-zero vector is divided
    by zero.

    """

    lengths = norm(x)
    return x / lengths[..., None]

expit #

expit(x)

Compute logistic activation.

Source code in src/itembed/optimization.py

@jit(
    float32(float32),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def expit(x):
    """Logistic sigmoid of a scalar, ``1 / (1 + exp(-x))``."""

    decay = math.exp(-x)
    return 1 / (1 + decay)

do_step #

do_step(left, right, syn_left, syn_right, tmp_syn, num_negative, learning_rate)

Apply a single training step.

Parameters:

Name	Description	Default
`left`	Left-hand item.	required
`right`	Right-hand item.	required
`syn_left`	Left-hand embeddings.	required
`syn_right`	Right-hand embeddings.	required
`tmp_syn`	Internal buffer (allocated only once, for performance).	required
`num_negative`	Number of negative samples.	required
`learning_rate`	Learning rate.	required

Source code in src/itembed/optimization.py

@jit(
    void(
        int32,
        int32,
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_step(
    left,
    right,
    syn_left,
    syn_right,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Apply a single training step.

    One positive update is made for the pair ``(left, right)``, followed by
    ``num_negative`` negative updates against randomly drawn rows of
    ``syn_right``. Both embedding arrays are modified in place and nothing is
    returned.

    Parameters
    ----------
    left: int32
        Left-hand item.
    right: int32
        Right-hand item.
    syn_left: float32, num_left x num_dimension
        Left-hand embeddings; only row ``left`` is updated.
    syn_right: float32, num_right x num_dimension
        Right-hand embeddings; the rows of ``right`` and of every sampled
        negative target are updated.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance). Its content
        is overwritten.
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    num_right, num_dimension = syn_right.shape

    # Approximate softmax, by applying a single positive and many negative updates
    tmp_syn[:] = 0
    for n in range(num_negative + 1):
        # First iteration is the positive pair; the others are negatives
        if n == 0:
            target = right
            label = 1
        else:
            # Uniform draw over all right-hand rows (bounds are inclusive);
            # the positive target is not excluded from the draw
            target = random.randint(0, num_right - 1)
            label = 0

        # Compute dot product between reference and target
        logit = np.dot(syn_left[left], syn_right[target])

        # Compute gradient scale
        gradient = (label - expit(logit)) * learning_rate

        # Accumulate gradients for left-hand embeddings; this runs before the
        # right-hand update below, so it reads the not-yet-updated target row
        for c in range(num_dimension):
            tmp_syn[c] += gradient * syn_right[target, c]

        # Backpropagate to right-hand embeddings
        for c in range(num_dimension):
            syn_right[target, c] += gradient * syn_left[left, c]

    # Backpropagate to left-hand embeddings, once all targets were processed,
    # so that every target above saw the same left-hand row
    for c in range(num_dimension):
        syn_left[left, c] += tmp_syn[c]

do_supervised_steps #

do_supervised_steps(left_itemset, right_itemset, left_weights, right_weights, left_syn, right_syn, tmp_syn, num_negative, learning_rate)

Apply steps from two itemsets.

This is used in a supervised setting, where left-hand items are features and right-hand items are labels.

Parameters:

Name	Description	Default
`left_itemset`	Feature items.	required
`right_itemset`	Label items.	required
`left_weights`	Feature item weights.	required
`right_weights`	Label item weights.	required
`left_syn`	Feature embeddings.	required
`right_syn`	Label embeddings.	required
`tmp_syn`	Internal buffer (allocated only once, for performance).	required
`num_negative`	Number of negative samples.	required
`learning_rate`	Learning rate.	required

Source code in src/itembed/optimization.py

@jit(
    void(
        int32[:],
        int32[:],
        float32[:],
        float32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_supervised_steps(
    left_itemset,
    right_itemset,
    left_weights,
    right_weights,
    left_syn,
    right_syn,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Train on every (feature, label) pair of two itemsets.

    Used in the supervised setting: left-hand items are features, right-hand
    items are labels. One step is applied per pair, with the learning rate
    multiplied by the weights of both items.

    Parameters
    ----------
    left_itemset: int32, left_length
        Feature items.
    right_itemset: int32, right_length
        Label items.
    left_weights: float32, left_length
        Feature item weights.
    right_weights: float32, right_length
        Label item weights.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    # TODO maybe need to apply subsampling?
    # TODO possibly two passes, to garantee that each item set is fully used?
    for i in range(len(left_itemset)):
        feature = left_itemset[i]
        # Feature-side part of the rate is shared by all labels of this row
        feature_rate = learning_rate * left_weights[i]
        for j in range(len(right_itemset)):
            do_step(
                feature,
                right_itemset[j],
                left_syn,
                right_syn,
                tmp_syn,
                num_negative,
                feature_rate * right_weights[j],
            )

do_unsupervised_steps #

do_unsupervised_steps(itemset, weights, syn0, syn1, tmp_syn, num_negative, learning_rate)

Apply steps from a single itemset.

This is used in an unsupervised setting, where co-occurrence is used as a knowledge source. It follows the skip-gram method, as introduced by Mikolov et al.

For each item, a single random neighbor is sampled to define a pair. This means that only a subset of possible pairs is considered. The reason is twofold: training stays in linear complexity w.r.t. itemset lengths and large itemsets do not dominate smaller ones.

Itemset must have at least 2 items. Length is not checked, for efficiency.

Parameters:

Name	Description	Default
`itemset`	Items.	required
`weights`	Item weights.	required
`syn0`	First set of embeddings.	required
`syn1`	Second set of embeddings.	required
`tmp_syn`	Internal buffer (allocated only once, for performance).	required
`num_negative`	Number of negative samples.	required
`learning_rate`	Learning rate.	required

Source code in src/itembed/optimization.py

@jit(
    void(
        int32[:],
        float32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_unsupervised_steps(
    itemset,
    weights,
    syn0,
    syn1,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Train on pairs sampled inside a single itemset.

    Used in the unsupervised setting, where co-occurrence is the knowledge
    source, following the skip-gram method introduced by Mikolov et al.

    Every item is paired with one other item of the same itemset, picked at
    random. Only a subset of all possible pairs is therefore visited, which
    keeps the cost linear in the itemset length and prevents large itemsets
    from dominating smaller ones.

    Itemset must have at least 2 items. Length is not checked, for efficiency.

    Parameters
    ----------
    itemset: int32, length
        Items.
    weights: float32, length
        Item weights.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    size = itemset.shape[0]

    for center in range(size):
        # Draw among the size - 1 other positions, then shift past the
        # center so that an item is never paired with its own position
        neighbor = random.randint(0, size - 2)
        if neighbor >= center:
            neighbor += 1

        pair_rate = learning_rate * weights[center] * weights[neighbor]
        do_step(
            itemset[center],
            itemset[neighbor],
            syn0,
            syn1,
            tmp_syn,
            num_negative,
            pair_rate,
        )

do_supervised_batch #

do_supervised_batch(left_items, left_weights, left_offsets, left_indices, right_items, right_weights, right_offsets, right_indices, left_syn, right_syn, tmp_syn, num_negative, learning_rate)

Apply supervised steps from multiple itemsets.

See Also

`do_supervised_steps`

Parameters:

Name	Description	Default
`left_items`	Itemsets, concatenated.	required
`left_weights`	Item weights, concatenated.	required
`left_offsets`	Boundaries in packed items.	required
`left_indices`	Subset of offsets to consider.	required
`right_items`	Itemsets, concatenated.	required
`right_weights`	Item weights, concatenated.	required
`right_offsets`	Boundaries in packed items.	required
`right_indices`	Subset of offsets to consider.	required
`left_syn`	Feature embeddings.	required
`right_syn`	Label embeddings.	required
`tmp_syn`	Internal buffer (allocated only once, for performance).	required
`num_negative`	Number of negative samples.	required
`learning_rate`	Learning rate.	required

Source code in src/itembed/optimization.py

@jit(
    void(
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_supervised_batch(
    left_items,
    left_weights,
    left_offsets,
    left_indices,
    right_items,
    right_weights,
    right_offsets,
    right_indices,
    left_syn,
    right_syn,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Run supervised steps over a batch of itemset pairs.

    Entry ``i`` of ``left_indices`` and entry ``i`` of ``right_indices``
    select the left and right itemsets that are trained together. The number
    of pairs is taken from ``left_indices``.

    See Also
    --------
    :meth:`do_supervised_steps`

    Parameters
    ----------
    left_items: int32, num_left_item
        Itemsets, concatenated.
    left_weights: float32, num_left_item
        Item weights, concatenated.
    left_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    left_indices: int32, num_step
        Subset of offsets to consider.
    right_items: int32, num_right_item
        Itemsets, concatenated.
    right_weights: float32, num_right_item
        Item weights, concatenated.
    right_offsets: int32, num_itemset + 1
        Boundaries in packed items.
    right_indices: int32, num_step
        Subset of offsets to consider.
    left_syn: float32, num_left_label x num_dimension
        Feature embeddings.
    right_syn: float32, num_right_label x num_dimension
        Label embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    for step in range(left_indices.shape[0]):
        left_row = left_indices[step]
        right_row = right_indices[step]

        # Locate both itemsets inside their packed arrays
        left_begin = left_offsets[left_row]
        left_end = left_offsets[left_row + 1]
        right_begin = right_offsets[right_row]
        right_end = right_offsets[right_row + 1]

        do_supervised_steps(
            left_items[left_begin:left_end],
            right_items[right_begin:right_end],
            left_weights[left_begin:left_end],
            right_weights[right_begin:right_end],
            left_syn,
            right_syn,
            tmp_syn,
            num_negative,
            learning_rate,
        )

do_unsupervised_batch #

do_unsupervised_batch(items, weights, offsets, indices, syn0, syn1, tmp_syn, num_negative, learning_rate)

Apply unsupervised steps from multiple itemsets.

See Also

`do_unsupervised_steps`

Parameters:

Name	Description	Default
`items`	Itemsets, concatenated.	required
`weights`	Item weights, concatenated.	required
`offsets`	Boundaries in packed items.	required
`indices`	Subset of offsets to consider.	required
`syn0`	First set of embeddings.	required
`syn1`	Second set of embeddings.	required
`tmp_syn`	Internal buffer (allocated only once, for performance).	required
`num_negative`	Number of negative samples.	required
`learning_rate`	Learning rate.	required

Source code in src/itembed/optimization.py

@jit(
    void(
        int32[:],
        float32[:],
        int32[:],
        int32[:],
        float32[:, ::1],
        float32[:, ::1],
        float32[::1],
        int32,
        float32,
    ),
    nopython=True,
    nogil=True,
    fastmath=True,
)
def do_unsupervised_batch(
    items,
    weights,
    offsets,
    indices,
    syn0,
    syn1,
    tmp_syn,
    num_negative,
    learning_rate,
):
    """Run unsupervised steps over a batch of itemsets.

    Each entry of ``indices`` selects one itemset, which is sliced out of the
    packed arrays and trained on in turn.

    See Also
    --------
    :meth:`do_unsupervised_steps`

    Parameters
    ----------
    items: int32, num_item
        Itemsets, concatenated.
    weights: float32, num_item
        Item weights, concatenated.
    offsets: int32, num_itemset + 1
        Boundaries in packed items.
    indices: int32, num_step
        Subset of offsets to consider.
    syn0: float32, num_label x num_dimension
        First set of embeddings.
    syn1: float32, num_label x num_dimension
        Second set of embeddings.
    tmp_syn: float32, num_dimension
        Internal buffer (allocated only once, for performance).
    num_negative: int32
        Number of negative samples.
    learning_rate: float32
        Learning rate.

    """

    for row in indices:
        # Locate the itemset inside the packed arrays
        begin = offsets[row]
        end = offsets[row + 1]

        do_unsupervised_steps(
            items[begin:end],
            weights[begin:end],
            syn0,
            syn1,
            tmp_syn,
            num_negative,
            learning_rate,
        )

Reference#

Preprocessing tools#

index_batch_stream #

pack_itemsets #

prune_itemsets #

Tasks#

Task #

do_batch #

UnsupervisedTask #

SupervisedTask #

CompoundTask #

Training tools#

initialize_syn #

train #

Postprocessing tools#

softmax #

norm #

normalize #

Low-level optimization methods#

expit #

do_step #

do_supervised_steps #

do_unsupervised_steps #

do_supervised_batch #

do_unsupervised_batch #