Query artifacts¶

Here, we’ll query artifacts and inspect their metadata.

This guide can be skipped if you are only interested in working with the overall collection rather than with its individual artifacts.

import lamindb as ln
import bionty as bt

💡 connected lamindb: testuser1/test-scrna

ln.settings.transform.stem_uid = "agayZTonayqA"
ln.settings.transform.version = "1"
ln.track()

💡 notebook imports: bionty==0.43.0 lamindb==0.72.0

💡 saved: Transform(version='1', uid='agayZTonayqA5zKv', name='Query artifacts', key='scrna3', type='notebook', updated_at=2024-05-20 08:34:18 UTC, created_by_id=1)

💡 saved: Run(uid='GAnEnw2bunyQGAEBmpXc', transform_id=3, created_by_id=1)

Query artifacts by provenance metadata¶

users = ln.User.lookup()

ln.Transform.filter(created_by=users.testuser1).search("scrna").df()

	version	uid	name	key	description	type	latest_report_id	source_code_id	reference	reference_type	created_at	updated_at	created_by_id
id
1	1	Nv48yAceNSh85zKv	scRNA-seq	scrna	None	notebook	None	None	None	None	2024-05-20 08:31:15.220791+00:00	2024-05-20 08:31:15.220814+00:00	1
2	1	ManDYgmftZ8C5zKv	Standardize and append a batch of data	scrna2	None	notebook	None	None	None	None	2024-05-20 08:33:50.019072+00:00	2024-05-20 08:33:50.019096+00:00	1
3	1	agayZTonayqA5zKv	Query artifacts	scrna3	None	notebook	None	None	None	None	2024-05-20 08:34:18.788298+00:00	2024-05-20 08:34:18.788322+00:00	1

transform = ln.Transform.filter(uid="Nv48yAceNSh85zKv").one()

ln.Artifact.filter(transform=transform).df()

	version	created_at	created_by_id	updated_at	uid	storage_id	key	suffix	accessor	description	size	hash	hash_type	n_objects	n_observations	transform_id	run_id	visibility	key_is_virtual
id
1	None	2024-05-20 08:33:40.586863+00:00	1	2024-05-20 08:33:44.522165+00:00	8m6FHxYGVzSCR0HutpXV	1	None	.h5ad	AnnData	Human immune cells from Conde22	57612943	9sXda5E7BYiVoDOQkTC0KB	sha1-fl	None	1648	1	1	1	True

Query artifacts by biological metadata¶

organism = bt.Organism.lookup()
tissues = bt.Tissue.lookup()

query = ln.Artifact.filter(
    organisms=organism.human,
    tissues=tissues.bone_marrow,
)

query.df()

	version	created_at	updated_at	uid	key	suffix	accessor	description	size	hash	hash_type	n_objects	n_observations	visibility	key_is_virtual	created_by_id	storage_id	transform_id	run_id
id

Inspect artifact metadata¶

query_set = ln.Artifact.filter().all()
artifact1, artifact2 = query_set[0], query_set[1]

artifact1.describe()

Artifact(updated_at=2024-05-20 08:33:44 UTC, uid='8m6FHxYGVzSCR0HutpXV', suffix='.h5ad', accessor='AnnData', description='Human immune cells from Conde22', size=57612943, hash='9sXda5E7BYiVoDOQkTC0KB', hash_type='sha1-fl', n_observations=1648, visibility=1, key_is_virtual=True)

Provenance:
  📎 created_by: User(uid='DzTjkKse', handle='testuser1', name='Test User1')
  📎 storage: uid='vRrf9HmLWRZd', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', instance_uid='5ZP9QR2HPILj')
  📎 transform: Transform(version='1', uid='Nv48yAceNSh85zKv', name='scRNA-seq', key='scrna', type='notebook')
  📎 run: Run(uid='HAs0nupJNcWDPI55X0Mf', started_at=2024-05-20 08:31:15 UTC, is_consecutive=True)
  📎 input_of (core.Run): ['2024-05-20 08:33:50 UTC']
Features:
  var: FeatureSet(uid='6TthNIyffYvXM8ZjuqQD', n=36503, dtype='float', registry='bionty.Gene')
    'MIR1302-2HG', 'FAM138A', 'OR4F5', 'None', 'OR4F29', 'OR4F16', 'LINC01409', 'FAM87B', 'LINC01128', 'LINC00115', 'FAM41C', 'LINC02593', 'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4'
  obs: FeatureSet(uid='nMzktuMdWrMztdGcNVQ9', n=4, registry='Feature')
    🔗 donor (4, cat[ULabel]): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C'
    🔗 tissue (4, cat[bionty.Tissue]): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow'
    🔗 cell_type (4, cat[bionty.CellType]): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage'
    🔗 assay (4, cat[bionty.ExperimentalFactor]): '10x 3' v3', '10x 5' v2', '10x 5' v1'
Labels:
  📎 tissues (17, bionty.Tissue): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow'
  📎 cell_types (32, bionty.CellType): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage'
  📎 experimental_factors (3, bionty.ExperimentalFactor): '10x 3' v3', '10x 5' v2', '10x 5' v1'
  📎 ulabels (12, ULabel): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C'

artifact1.view_lineage()

_images/dafd2ab81598bbca91250fbfd974c2717cee0009a44f8794060d681235aa2592.svg

artifact2.describe()

Artifact(updated_at=2024-05-20 08:34:11 UTC, uid='Zr00ySkapCbp3bmuOeGt', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=857752, hash='0Fozmib89XWbFoD6hSq5yA', hash_type='md5', n_observations=70, visibility=1, key_is_virtual=True)

Provenance:
  📎 created_by: User(uid='DzTjkKse', handle='testuser1', name='Test User1')
  📎 storage: uid='vRrf9HmLWRZd', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', instance_uid='5ZP9QR2HPILj')
  📎 transform: Transform(version='1', uid='ManDYgmftZ8C5zKv', name='Standardize and append a batch of data', key='scrna2', type='notebook')
  📎 run: Run(uid='ka9LM9UnxfRAbPbJ2vRI', started_at=2024-05-20 08:33:50 UTC, is_consecutive=True)
Features:
  var: FeatureSet(uid='7i9Ev3NmCMSrXDHHlIqc', n=754, dtype='float', registry='bionty.Gene')
    'IL18', 'NPM3', 'S100A9', 'CNN2', 'S100A8', 'ARHGAP45', 'RNF34', 'GPX4', 'ADISSP', 'S100A6', 'S100A4', 'FAM174C', 'SIT1', 'CCDC107', 'RSL1D1', 'TLN1', 'TNFRSF17', 'HES4', 'PCNA', 'RAB13'
  obs: FeatureSet(uid='VmkaLzGo32Gr6UlXYWnG', n=1, registry='Feature')
    🔗 cell_type (1, cat[bionty.CellType]): 'dendritic cell', 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD14-positive, CD16-negative classical monocyte', 'B cell, CD19-positive', 'CD16-positive, CD56-dim natural killer cell, human', 'CD38-positive naive B cell', 'CD4-positive, alpha-beta T cell'
Labels:
  📎 cell_types (9, bionty.CellType): 'dendritic cell', 'effector memory CD4-positive, alpha-beta T cell, terminally differentiated', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD14-positive, CD16-negative classical monocyte', 'B cell, CD19-positive', 'CD16-positive, CD56-dim natural killer cell, human', 'CD38-positive naive B cell', 'CD4-positive, alpha-beta T cell'

artifact2.view_lineage()

_images/99a0d7f6b66b95e09e55453274a0eb955ae43841768f840f55e0ca6103392ebe.svg

Compare features¶

Here, we compute the genes shared by the two artifacts:

artifact1_genes = artifact1.features["var"]
artifact2_genes = artifact2.features["var"]

shared_genes = artifact1_genes & artifact2_genes
len(shared_genes)

shared_genes.list("symbol")[:10]

['HES4',
 'TNFRSF4',
 'SSU72',
 'PARK7',
 'RBP7',
 'SRM',
 'MAD2L2',
 'AGTRAP',
 'TNFRSF1B',
 'EFHD2']

Compare cell types¶

artifact1_celltypes = artifact1.cell_types.all()
artifact2_celltypes = artifact2.cell_types.all()

shared_celltypes = artifact1_celltypes & artifact2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

['CD16-positive, CD56-dim natural killer cell, human']

Load the individual artifacts¶

We can either load the artifacts into memory through .load() or access them in backed mode through .backed(), which loads their content lazily.

Let’s load them into memory:

adata1 = artifact1.load()
adata2 = artifact2.load()

We can now subset the two loaded AnnData objects to the shared cell types:

adata1_subset = adata1[adata1.obs["cell_type"].isin(shared_celltypes_names)]
adata2_subset = adata2[adata2.obs["cell_type"].isin(shared_celltypes_names)]