Source code for bndl_cassandra.coscan

from functools import partial, lru_cache
import logging

from bndl.compute.dataset import Dataset, Partition
from bndl.util.funcs import identity, getter
from bndl_cassandra import partitioner
from bndl_cassandra.partitioner import estimate_size, SizeEstimate
from bndl_cassandra.session import cassandra_session
from cytoolz.itertoolz import take


logger = logging.getLogger(__name__)


def get_or_none(index, container):
    try:
        return container[index]
    except IndexError:
        return None
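# A small illustration (hypothetical values): get_or_none mirrors dict.get
# for sequence indexing, so a missing group becomes None instead of raising
# IndexError:
#
#     get_or_none(0, ['a', 'b'])  # -> 'a'
#     get_or_none(0, [])          # -> None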
class CassandraCoScanDataset(Dataset):
    '''
    Dataset which scans several Cassandra tables within the same keyspace in
    parallel and groups their rows by (a prefix of) the primary key.
    '''
    def __init__(self, *scans, keys=None, dset_id=None):
        assert len(scans) > 1
        scans = list(scans)
        scan0 = scans[0]
        super().__init__(scan0.ctx, src=scans, dset_id=dset_id)

        for idx, scan in enumerate(scans[1:], 1):
            if isinstance(scan, str):
                if '.' in scan:
                    keyspace, table = scan.split('.', 1)
                else:
                    keyspace, table = scan0.keyspace, scan
                scan = self.ctx.cassandra_table(keyspace, table)
                scans[idx] = scan
            assert scan.contact_points == scan0.contact_points, \
                "only scan in parallel within the same cluster"
            assert scan.keyspace == scan0.keyspace, \
                "only scan in parallel within the same keyspace"
        assert len(set(scan.table for scan in scans)) == len(scans), \
            "don't scan the same table twice"

        self.contact_points = scan0.contact_points
        self.keyspace = scan0.keyspace
        self.srcparts = [src.parts() for src in scans]
        self.pcount = len(self.srcparts[0])

        # TODO check format (dicts, tuples, namedtuples, etc.)
        # TODO adapt keyfuncs to below

        if isinstance(keys, str):
            self.keys = keys = [keys] * len(scans)
        else:
            self.keys = keys

        with cassandra_session(self.ctx, contact_points=self.contact_points) as session:
            ks_meta = session.cluster.metadata.keyspaces[self.keyspace]
            tbl_metas = [ks_meta.tables[scan.table] for scan in scans]

        if not keys:
            primary_key_length = len(tbl_metas[0].primary_key)
            for tbl_meta in tbl_metas[1:]:
                assert len(tbl_meta.primary_key) == primary_key_length, \
                    "can't co-scan without keys with varying primary key length"
            self.keyfuncs = [partial(take, primary_key_length)] * len(scans)
            self.grouptransforms = [partial(get_or_none, 0)] * len(scans)
        else:
            assert len(keys) == len(scans), \
                "provide a key for each table scanned or none at all"
            self.keyfuncs = []
            self.grouptransforms = []
            for key, scan, tbl_meta in zip(keys, scans, tbl_metas):
                if isinstance(key, str):
                    key = (key,)
                keylen = len(key)
                assert len(tbl_meta.partition_key) <= keylen, \
                    "can't co-scan over a table keyed by part of the partition key"
                assert tuple(key) == tuple(c.name for c in tbl_meta.primary_key)[:keylen], \
                    "the key columns must be the first part (or all) of the primary key"
                assert scan._select is None or tuple(key) == tuple(scan._select)[:keylen], \
                    "select all columns or the primary key columns in the order as they " \
                    "are defined in the CQL schema"
                self.keyfuncs.append(partial(take, keylen))
                if keylen == len(tbl_meta.primary_key):
                    self.grouptransforms.append(partial(get_or_none, 0))
                else:
                    self.grouptransforms.append(identity)
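    # A sketch of a valid ``keys`` argument, assuming two hypothetical tables
    # with primary keys (user_id, visit_id) and (user_id,). Per the assertions
    # above, each key must cover at least the full partition key and must be a
    # prefix of that table's primary key, in CQL schema order:
    #
    #     CassandraCoScanDataset(visits_scan, users_scan,
    #                            keys=[('user_id',), ('user_id',)])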
    def coscan(self, *others, keys=None):
        assert len(others) > 0
        return CassandraCoScanDataset(*tuple(self.src) + others, keys=keys)
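    # Typical entry point (hypothetical keyspace and table names); ``ctx`` is
    # assumed to be a bndl compute context with bndl_cassandra loaded:
    #
    #     users = ctx.cassandra_table('example_ks', 'users')
    #     visits = ctx.cassandra_table('example_ks', 'visits')
    #     grouped = users.coscan(visits, keys='user_id')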
    @lru_cache()
    def parts(self):
        from bndl_cassandra.dataset import CassandraScanPartition

        with cassandra_session(self.ctx, contact_points=self.contact_points) as session:
            size_estimates = sum((estimate_size(session, self.keyspace, src.table)
                                  for src in self.src), SizeEstimate(0, 0, 0))

        partitions = partitioner.partition_ranges(self.ctx, self.contact_points, self.keyspace,
                                                  size_estimates=size_estimates)

        return [
            CassandraCoScanPartition(self, idx, [CassandraScanPartition(scan, idx, *part)
                                                 for scan in self.src])
            for idx, part in enumerate(partitions)
        ]
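# A design note on parts(): the token ranges are computed once for the whole
# co-scan, so for a given partition index every table scan covers the same
# slice of the Cassandra ring, e.g. (hypothetical):
#
#     parts()[idx].scans  # -> [CassandraScanPartition(scan0, idx, <range>),
#                         #     CassandraScanPartition(scan1, idx, <range>)]
#
# which is what allows CassandraCoScanPartition._compute to merge rows
# locally, without a shuffle.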
class CassandraCoScanPartition(Partition):
    '''
    Partition of a CassandraCoScanDataset; wraps one CassandraScanPartition
    per co-scanned table, all covering the same token range.
    '''
    def __init__(self, dset, idx, scans):
        super().__init__(dset, idx)
        self.scans = scans
    def _locality(self, workers):
        # all sub-scans cover the same token range, so the locality of the
        # first scan applies to the co-scan as a whole
        return self.scans[0]._locality(workers)

    def _compute(self):
        keyfuncs = self.dset.keyfuncs
        grouptransforms = self.dset.grouptransforms
        subscans = [scan.compute() for scan in self.scans]

        # merge the rows of all sub-scans into one group per table per key
        merged = {}
        for cidx, scan in enumerate(subscans):
            keyf = keyfuncs[cidx]
            for row in scan:
                key = tuple(keyf(row))
                batch = merged.get(key)
                if not batch:
                    merged[key] = batch = [[] for _ in subscans]
                batch[cidx].append(row)

        # apply the group transforms (e.g. collapse a group to a single row
        # when keyed on the full primary key) and yield (key, groups) pairs
        for key, groups in merged.items():
            for idx, (group, transform) in enumerate(zip(groups, grouptransforms)):
                groups[idx] = transform(group)
            yield key, groups
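# A sketch of the output shape of _compute (hypothetical rows), for a co-scan
# of two tables keyed on their full primary keys: each element is a
# (key, groups) pair with one group per table, collapsed by get_or_none to a
# single row, or None when a table has no row for that key:
#
#     (('user-1',), [Row(user_id='user-1', ...), Row(user_id='user-1', ...)])
#     (('user-2',), [Row(user_id='user-2', ...), None])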