diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
index dfeafd731..da9ff3184 100644
--- a/.github/workflows/checks.yaml
+++ b/.github/workflows/checks.yaml
@@ -40,6 +40,9 @@ jobs:
- name: Print dependencies
run: uv pip list
+ - name: Install Playwright
+ run: make install-playwright
+
- name: Run basedpyright
run: uv run basedpyright
diff --git a/.gitignore b/.gitignore
index 64704d4e2..168523b42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ docs/coverage/**
artifacts/**
docs/dep_graph/**
tests/.temp/**
+*.prof
**/out/
neuronpedia_outputs/
diff --git a/Makefile b/Makefile
index 4cc60fe44..46af20a14 100644
--- a/Makefile
+++ b/Makefile
@@ -77,6 +77,50 @@ coverage:
uv run python -m coverage html --directory=$(COVERAGE_DIR)/html/
+BUNDLED_DASHBOARD_DIR=spd/clustering/dashboard/_bundled
+
+.PHONY: bundle-dashboard
+bundle-dashboard:
+ @mkdir -p $(BUNDLED_DASHBOARD_DIR)
+ uv run python -m muutils.web.bundle_html \
+ spd/clustering/dashboard/index.html \
+ --output $(BUNDLED_DASHBOARD_DIR)/index.html \
+ --source-dir spd/clustering/dashboard
+ uv run python -m muutils.web.bundle_html \
+ spd/clustering/dashboard/cluster.html \
+ --output $(BUNDLED_DASHBOARD_DIR)/cluster.html \
+ --source-dir spd/clustering/dashboard
+ @echo "Bundled HTML files to $(BUNDLED_DASHBOARD_DIR)/"
+
+.PHONY: clean-test-dashboard
+clean-test-dashboard:
+ rm -rf tests/.temp/dashboard-integration
+
+
+.PHONY: install-playwright
+install-playwright:
+ @echo "Install Playwright browsers, used for dashboard tests"
+ uv run playwright install chromium
+ uv run playwright install-deps
+
+.PHONY: test-dashboard
+test-dashboard: clean-test-dashboard bundle-dashboard
+ pytest tests/clustering/dashboard/test_dashboard_integration.py --runslow -v --durations 10
+
+
+.PHONY: clustering-dashboard
+clustering-dashboard: bundle-dashboard
+ uv run python spd/clustering/dashboard/run.py \
+ spd/clustering/dashboard/dashboard_config.yaml
+
+.PHONY: clustering-dashboard-profile
+clustering-dashboard-profile: bundle-dashboard
+ uv run python -m cProfile -o dashboard.prof spd/clustering/dashboard/run.py \
+ spd/clustering/dashboard/dashboard_config.yaml
+ @echo "\nProfile saved to dashboard.prof"
+ @echo "View with: python -m pstats dashboard.prof"
+ @echo "Or install snakeviz and run: snakeviz dashboard.prof"
+
.PHONY: clean
clean:
@echo "Cleaning Python cache and build artifacts..."
diff --git a/TODO.md b/TODO.md
deleted file mode 100644
index 9e6f14815..000000000
--- a/TODO.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# TODO: Cluster Coactivation Matrix Implementation
-
-## What Was Changed
-
-### 1. Added `ClusterActivations` dataclass (`spd/clustering/dashboard/compute_max_act.py`)
-- New dataclass to hold vectorized cluster activations for all clusters
-- Contains `activations` tensor [n_samples, n_clusters] and `cluster_indices` list
-
-### 2. Added `compute_all_cluster_activations()` function
-- Vectorized computation of all cluster activations at once
-- Replaces the per-cluster loop for better performance
-- Returns `ClusterActivations` object
-
-### 3. Added `compute_cluster_coactivations()` function
-- Computes coactivation matrix from list of `ClusterActivations` across batches
-- Binarizes activations (acts > 0) and computes matrix multiplication: `activation_mask.T @ activation_mask`
-- Follows the pattern from `spd/clustering/merge.py:69`
-- Returns tuple of (coactivation_matrix, cluster_indices)
-
-### 4. Modified `compute_max_activations()` function
-- Now accumulates `ClusterActivations` from each batch in `all_cluster_activations` list
-- Calls `compute_cluster_coactivations()` to compute the matrix
-- **Changed return type**: now returns `tuple[DashboardData, np.ndarray, list[int]]`
- - Added coactivation matrix and cluster_indices to return value
-
-### 5. Modified `spd/clustering/dashboard/run.py`
-- Updated to handle new return value from `compute_max_activations()`
-- Saves coactivation matrix as `coactivations.npz` in the dashboard output directory
-- NPZ file contains:
- - `coactivations`: the [n_clusters, n_clusters] matrix
- - `cluster_indices`: array mapping matrix positions to cluster IDs
-
-## What Needs to be Checked
-
-### Testing
-- [ ] **Run the dashboard pipeline** on a real clustering run to verify:
- - Coactivation computation doesn't crash
- - Coactivations are saved correctly to NPZ file
- - Matrix dimensions are correct
- - `cluster_indices` mapping is correct
-
-### Type Checking
-- [ ] Run `make type` to ensure no type errors were introduced
-- [ ] Verify jaxtyping annotations are correct
-
-### Verification
-- [ ] Load a saved `coactivations.npz` file and verify:
- ```python
- data = np.load("coactivations.npz")
- coact = data["coactivations"]
- cluster_indices = data["cluster_indices"]
- # Check: coact should be symmetric
- # Check: diagonal should be >= off-diagonal (clusters coactivate with themselves most)
- # Check: cluster_indices length should match coact.shape[0]
- ```
-
-### Performance
-- [ ] Check if vectorization actually improved performance
-- [ ] Monitor memory usage with large numbers of clusters
-
-### Edge Cases
-- [ ] Test with clusters that have zero activations
-- [ ] Test with single-batch runs
-- [ ] Test with very large number of clusters
-
-### Integration
-- [ ] Verify the coactivation matrix can be used in downstream analysis
-- [ ] Consider if visualization of coactivations should be added to dashboard
-
-## Notes
-- The coactivation matrix is computed over all samples processed (n_batches * batch_size * seq_len samples)
-- Binarization threshold is currently hardcoded as `> 0` - may want to make this configurable
-- The computation happens in the dashboard pipeline, NOT during the main clustering pipeline
diff --git a/pyproject.toml b/pyproject.toml
index c59039aae..da1d4b24d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
"simple_stories_train @ git+https://github.com/goodfire-ai/simple_stories_train.git@dev",
"scipy>=1.14.1",
"muutils",
+ "zanj", # for dashboard data saving/loading
"fastapi",
"uvicorn",
]
@@ -42,6 +43,7 @@ dev = [
"ruff",
"basedpyright<1.32.0", # pyright and wandb issues, see https://github.com/goodfire-ai/spd/pull/232
"pre-commit",
+ "playwright", # for browser-based integration tests
]
[project.scripts]
diff --git a/spd/clustering/activations.py b/spd/clustering/activations.py
index cd6a2b742..66c0672b1 100644
--- a/spd/clustering/activations.py
+++ b/spd/clustering/activations.py
@@ -10,7 +10,7 @@
ActivationsTensor,
BoolActivationsTensor,
ClusterCoactivationShaped,
- ComponentLabels,
+ SubComponentKey,
)
from spd.clustering.util import ModuleFilterFunc
from spd.models.component_model import ComponentModel, OutputWithCache
@@ -54,16 +54,16 @@ class FilteredActivations(NamedTuple):
activations: ActivationsTensor
"activations after filtering dead components"
- labels: ComponentLabels
- "list of length c with labels for each preserved component"
+ subcomponent_keys: list[SubComponentKey]
+ "list of length c with SubComponentKey for each preserved component"
- dead_components_labels: ComponentLabels | None
- "list of labels for dead components, or None if no filtering was applied"
+ dead_subcomponent_keys: list[SubComponentKey] | None
+ "list of SubComponentKey for dead components, or None if no filtering was applied"
@property
def n_alive(self) -> int:
"""Number of alive components after filtering."""
- n_alive: int = len(self.labels)
+ n_alive: int = len(self.subcomponent_keys)
assert n_alive == self.activations.shape[1], (
f"{n_alive = } != {self.activations.shape[1] = }"
)
@@ -72,12 +72,12 @@ def n_alive(self) -> int:
@property
def n_dead(self) -> int:
"""Number of dead components after filtering."""
- return len(self.dead_components_labels) if self.dead_components_labels else 0
+ return len(self.dead_subcomponent_keys) if self.dead_subcomponent_keys else 0
def filter_dead_components(
activations: ActivationsTensor,
- labels: ComponentLabels,
+ subcomponent_keys: list[SubComponentKey],
filter_dead_threshold: float = 0.01,
) -> FilteredActivations:
"""Filter out dead components based on a threshold
@@ -86,31 +86,29 @@ def filter_dead_components(
activations and labels are returned as is, `dead_components_labels` is `None`.
otherwise, components whose **maximum** activations across all samples is below the threshold
- are considered dead and filtered out. The labels of these components are returned in `dead_components_labels`.
+ are considered dead and filtered out. The SubComponentKey entries of these components are returned in `dead_subcomponent_keys`.
`dead_components_labels` will also be `None` if no components were below the threshold.
"""
- dead_components_lst: ComponentLabels | None = None
+ dead_components_lst: list[SubComponentKey] | None = None
if filter_dead_threshold > 0:
- dead_components_lst = ComponentLabels(list())
+ dead_components_lst = []
max_act: Float[Tensor, " c"] = activations.max(dim=0).values
- dead_components: Bool[Tensor, " c"] = max_act < filter_dead_threshold
+ dead_components_mask: Bool[Tensor, " c"] = max_act < filter_dead_threshold
- if dead_components.any():
- activations = activations[:, ~dead_components]
- alive_labels: list[tuple[str, bool]] = [
- (lbl, bool(keep.item()))
- for lbl, keep in zip(labels, ~dead_components, strict=False)
+ if dead_components_mask.any():
+ activations = activations[:, ~dead_components_mask]
+ alive_labels: list[tuple[SubComponentKey, bool]] = [
+ (comp, bool(keep.item()))
+ for comp, keep in zip(subcomponent_keys, ~dead_components_mask, strict=False)
]
# re-assign labels only if we are filtering
- labels = ComponentLabels([label for label, keep in alive_labels if keep])
- dead_components_lst = ComponentLabels(
- [label for label, keep in alive_labels if not keep]
- )
+ subcomponent_keys = [comp for comp, keep in alive_labels if keep]
+ dead_components_lst = [comp for comp, keep in alive_labels if not keep]
return FilteredActivations(
activations=activations,
- labels=labels,
- dead_components_labels=dead_components_lst if dead_components_lst else None,
+ subcomponent_keys=subcomponent_keys,
+ dead_subcomponent_keys=dead_components_lst if dead_components_lst else None,
)
@@ -124,11 +122,11 @@ class ProcessedActivations:
activations: ActivationsTensor
"activations after filtering and concatenation"
- labels: ComponentLabels
- "list of length c with labels for each preserved component, format `{module_name}:{component_index}`"
+ subcomponent_keys: list[SubComponentKey]
+ "list of length c with SubComponentKey for each preserved component"
- dead_components_lst: ComponentLabels | None
- "list of labels for dead components, or None if no filtering was applied"
+ dead_subcomponent_keys: list[SubComponentKey] | None
+ "list of SubComponentKey for dead components, or None if no filtering was applied"
def validate(self) -> None:
"""Validate the processed activations"""
@@ -143,7 +141,7 @@ def n_components_original(self) -> int:
@property
def n_components_alive(self) -> int:
"""Number of alive components after filtering. equal to the length of `labels`"""
- n_alive: int = len(self.labels)
+ n_alive: int = len(self.subcomponent_keys)
assert n_alive + self.n_components_dead == self.n_components_original, (
f"({n_alive = }) + ({self.n_components_dead = }) != ({self.n_components_original = })"
)
@@ -156,26 +154,26 @@ def n_components_alive(self) -> int:
@property
def n_components_dead(self) -> int:
"""Number of dead components after filtering. equal to the length of `dead_components_lst` if it is not None, or 0 otherwise"""
- return len(self.dead_components_lst) if self.dead_components_lst else 0
+ return len(self.dead_subcomponent_keys) if self.dead_subcomponent_keys else 0
@cached_property
def label_index(self) -> dict[str, int | None]:
- """Create a mapping from label to alive index (`None` if dead)"""
+ """Create a mapping from label string to alive index (`None` if dead)"""
return {
- **{label: i for i, label in enumerate(self.labels)},
+ **{comp.label: i for i, comp in enumerate(self.subcomponent_keys)},
**(
- {label: None for label in self.dead_components_lst}
- if self.dead_components_lst
+ {comp.label: None for comp in self.dead_subcomponent_keys}
+ if self.dead_subcomponent_keys
else {}
),
}
def get_label_index(self, label: str) -> int | None:
- """Get the index of a label in the activations, or None if it is dead"""
+ """Get the index of a label string in the activations, or None if it is dead"""
return self.label_index[label]
def get_label_index_alive(self, label: str) -> int:
- """Get the index of a label in the activations, or raise if it is dead"""
+ """Get the index of a label string in the activations, or raise if it is dead"""
idx: int | None = self.get_label_index(label)
if idx is None:
raise ValueError(f"Label '{label}' is dead and has no index in the activations.")
@@ -239,10 +237,10 @@ def process_activations(
# compute the labels and total component count
total_c: int = 0
- labels: ComponentLabels = ComponentLabels(list())
+ labels: list[SubComponentKey] = []
for key, act in activations_.items():
c: int = act.shape[-1]
- labels.extend([f"{key}:{i}" for i in range(c)])
+ labels.extend([SubComponentKey(module=key, index=i) for i in range(c)])
total_c += c
# concat the activations
@@ -251,7 +249,7 @@ def process_activations(
# filter dead components
filtered_components: FilteredActivations = filter_dead_components(
activations=act_concat,
- labels=labels,
+ subcomponent_keys=labels,
filter_dead_threshold=filter_dead_threshold,
)
@@ -262,6 +260,6 @@ def process_activations(
return ProcessedActivations(
activations_raw=activations_,
activations=filtered_components.activations,
- labels=filtered_components.labels,
- dead_components_lst=filtered_components.dead_components_labels,
+ subcomponent_keys=filtered_components.subcomponent_keys,
+ dead_subcomponent_keys=filtered_components.dead_subcomponent_keys,
)
diff --git a/spd/clustering/consts.py b/spd/clustering/consts.py
index 8a9647dc8..3e48f3648 100644
--- a/spd/clustering/consts.py
+++ b/spd/clustering/consts.py
@@ -1,8 +1,10 @@
"""Constants and shared abstractions for clustering pipeline."""
+import hashlib
from abc import ABC, abstractmethod
+from dataclasses import dataclass
from pathlib import Path
-from typing import Literal, NewType
+from typing import Literal, NewType, override
import numpy as np
from jaxtyping import Bool, Float, Int
@@ -15,8 +17,39 @@
DistancesArray = Float[np.ndarray, "n_iters n_ens n_ens"]
# Component and label types (NewType for stronger type safety)
-ComponentLabel = NewType("ComponentLabel", str) # Format: "module_name:component_index"
-ComponentLabels = NewType("ComponentLabels", list[str])
+SubComponentLabel = NewType("SubComponentLabel", str) # Format: "module_name:component_index"
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class SubComponentKey:
+ """unique identifier of a subcomponent. indices can refer to dead components"""
+
+ module: str
+ index: int
+
+ @property
+ def label(self) -> SubComponentLabel:
+ """Component label as 'module:index'."""
+ return SubComponentLabel(f"{self.module}:{self.index}")
+
+ @classmethod
+ def from_label(cls, label: SubComponentLabel) -> "SubComponentKey":
+ """Create SubComponentKey from a component label."""
+ assert label.count(":") == 1, (
+ "Invalid component label format, expected '{{module}}:{{index}}'"
+ )
+ module, index_str = label.rsplit(":", 1)
+ return cls(module=module, index=int(index_str))
+
+ @override
+ def __str__(self) -> str:
+ return self.label
+
+ @override
+ def __hash__(self) -> int:
+ return int(hashlib.md5(str(self).encode()).hexdigest(), 16)
+
+
BatchId = NewType("BatchId", str)
# Path types
diff --git a/spd/clustering/dashboard/README.md b/spd/clustering/dashboard/README.md
new file mode 100644
index 000000000..1f4892796
--- /dev/null
+++ b/spd/clustering/dashboard/README.md
@@ -0,0 +1,52 @@
+# 2025-10-02 15:17
+
+For cluster view:
+
+- SAE/logitlens/tunedlens decoding of read and write directions for each cluster
+- github comments interface
+- more plots
+- base frequency of each token in dataset
+- do some kind of clustering on tokens via embedding space, to label their groups. ideally so we can make a histogram of which groups of tokens this cluster activates on
+- measure of how "concentrated" the subcomponents are per module
+- measure of depth in the model
+- measure of how "attention-y" vs "MLP-y" the clusters subcomponents are
+- skewed vs uniform activation frequencies, skewed vs uniform max act position
+ - maybe not max act pos, but overall mass across positions
+
+For list view (or wandb view?):
+
+- stats of tok concentration, entropy, subcomp in module concentration, etc across all clusters
+- some kind of embedding of clusters, 3d view, click on pt to go to cluster view
+
+
+
+
+
+# TODO
+
+## static:
+
+
+- makeup of activations on a per-subcomponent basis
+- makeup of clusters -- which modules do they have subcomponents from?
+- [ ] features in 2d plane -- display them as vector fields, with points in that 2d plane colored corresponding to various SAE features
+
+
+
+
+
+# causal masks
+
+one of the things we might want to do is:
+
+- define a causal mask, using:
+ - some subset of the data (a set of prompts)
+ - some other method
+- run inference using that particular causal mask on some other dataset
+
+this requires an easy way to define and use custom causal masks. a good solution might be something like:
+
+- interface to define a causal mask, by providing a dataset and/or manually editing
+ - it should have a button to "label" a causal mask -- probably, we can hash the causal mask, save the mask to a file on the backend, and use that hash as a key
+ - copy the hash
+- in other interfaces for doing inference with the mask, we can paste the hash to specify a causal mask
diff --git a/spd/clustering/dashboard/TODO.md b/spd/clustering/dashboard/TODO.md
new file mode 100644
index 000000000..bfcebd5ae
--- /dev/null
+++ b/spd/clustering/dashboard/TODO.md
@@ -0,0 +1,82 @@
+# Dashboard Refactor TODO
+
+This document tracks potential cleanup tasks discovered during the ZANJ refactor.
+
+## Completed ✅
+
+- Converted `DashboardData` to `SerializableDataclass`
+- Added `ClusterSample` dataclass for self-contained samples
+- Updated `ClusterData.generate()` to create self-contained samples
+- Refactored JavaScript to use `ZanjLoader` instead of manual file loading
+- Fixed `this.baseUrl` bug in `zanj.js`
+- Simplified config.js to remove deprecated file paths
+
+## Potential Cleanup Tasks
+
+### Python Backend
+
+1. **Remove deprecated fields in `ClusterData`** (`spd/clustering/dashboard/core/cluster_data.py:85`)
+ - `criterion_samples` can be removed once JS is fully migrated
+ - Currently kept for backward compatibility
+
+2. **Remove deprecated methods in `DashboardData`** (`spd/clustering/dashboard/core/dashboard_data.py:102-103`)
+ - Comment indicates deprecated methods can be removed
+ - ZANJ handles serialization automatically via `SerializableDataclass`
+
+3. **Simplify `ClusterData.serialize()`** (`spd/clustering/dashboard/core/cluster_data.py:259-298`)
+ - Currently has manual serialization logic
+ - Could potentially convert to `SerializableDataclass` if `.serialize()` isn't needed elsewhere
+ - Check if manual `BinnedData` serialization is still required
+
+4. **Remove `get_unique_activation_hashes()` method** (`spd/clustering/dashboard/core/cluster_data.py:217-225`)
+ - No longer needed since activations are self-contained in samples
+ - Was only used by old `DashboardData.save()` logic
+
+### JavaScript Frontend
+
+5. **Remove deprecated component activation logic** (`spd/clustering/dashboard/js/cluster-detail.js:11-12`)
+ - `componentActivations` and `enabledComponents` variables
+ - Related component toggle UI code (if no longer used)
+
+6. **Remove `combineComponentActivations()` function** (if exists in cluster-detail.js)
+ - Was used for combining component activations
+ - No longer needed with self-contained samples
+
+7. **Clean up config.js file paths** (`spd/clustering/dashboard/js/util/config.js:59-63`)
+ - Remove commented-out deprecated file paths
+ - Only `explanations` is needed now
+
+8. **Alpine.js model info component simplification** (`spd/clustering/dashboard/js/cluster-selection.js:6-10`)
+ - Could potentially be simplified further
+ - Data is now set directly from ZANJ load
+
+### Testing & Validation
+
+9. **Test with existing dashboard data**
+ - Verify backward compatibility with old data format (if needed)
+ - Test explanations.jsonl loading still works
+
+10. **Verify float16 handling**
+ - Check if ZANJ preserves float16 dtype for activations
+ - May need `serializable_field` configuration
+
+11. **Test lazy loading behavior**
+ - Verify large activation arrays lazy load correctly
+ - Check memory usage with large datasets
+
+### Documentation
+
+12. **Update dashboard README** (if exists)
+ - Document new ZANJ-based data format
+ - Explain self-contained cluster structure
+ - Update data generation examples
+
+13. **Add ZANJ dependency to requirements**
+ - Ensure `zanj` is in `pyproject.toml`
+ - Document minimum version if applicable
+
+## Notes
+
+- The refactor maintains backward compatibility where possible via deprecated fields
+- Component-level activation display may need revisiting if that feature is still used
+- Consider adding progress indicators to ZANJ loading in the future
diff --git a/spd/clustering/dashboard/__init__.py b/spd/clustering/dashboard/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spd/clustering/dashboard/_bundled/__init__.py b/spd/clustering/dashboard/_bundled/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spd/clustering/dashboard/_bundled/cluster.html b/spd/clustering/dashboard/_bundled/cluster.html
new file mode 100644
index 000000000..5babd2b98
--- /dev/null
+++ b/spd/clustering/dashboard/_bundled/cluster.html
@@ -0,0 +1,4680 @@
+
+
+
+ Cluster Details
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📋 Copy Explanation Template
+
Explanation: No explanation
+
+
+
+
+
+
+
+
Activation Statistics
+
+
+
+
+
+
+
+
+
+
+
+ Loading cluster data...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/spd/clustering/dashboard/_bundled/index.html b/spd/clustering/dashboard/_bundled/index.html
new file mode 100644
index 000000000..63f4f0039
--- /dev/null
+++ b/spd/clustering/dashboard/_bundled/index.html
@@ -0,0 +1,4775 @@
+
+
+
+ Cluster Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Cluster Selection Dashboard
+
+
+
Model Information
+
+
+ Total Modules:
+
+
+ Total Components:
+
+
+ Total Clusters:
+
+
+ Model Parameters:
+
+
+ Iteration:
+
+
+ Component Size:
+
+
+ Source SPD Run:
+
+
+ Clustering Run:
+
+
+ Pretrained Model:
+
+
+
+ Configuration Details
+
+
+ Seed:
+
+
+ Steps:
+
+
+ Learning Rate:
+
+
+ Batch Size:
+
+
+ Sigmoid Type:
+
+
+ Sampling:
+
+
+ LR Schedule:
+
+
+ Output Loss:
+
+
+
+
+
+
+ Loading data...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/spd/clustering/dashboard/cluster.html b/spd/clustering/dashboard/cluster.html
new file mode 100644
index 000000000..70a18ce3b
--- /dev/null
+++ b/spd/clustering/dashboard/cluster.html
@@ -0,0 +1,81 @@
+
+
+
+ Cluster Details
+
+
+
+
+
+
+
+
+
+
📋 Copy Explanation Template
+
Explanation: No explanation
+
+
+
+
+
+
+
+
Activation Statistics
+
+
+
+
+
+
+
+
+
+
+
+ Loading cluster data...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/spd/clustering/dashboard/core/__init__.py b/spd/clustering/dashboard/core/__init__.py
new file mode 100644
index 000000000..f95513e57
--- /dev/null
+++ b/spd/clustering/dashboard/core/__init__.py
@@ -0,0 +1,68 @@
+"""Dashboard core data structures - modular package.
+
+This package provides all data structures for the clustering dashboard.
+All symbols are re-exported from this __init__ for backward compatibility.
+"""
+
+from spd.clustering.consts import SubComponentKey
+from spd.clustering.dashboard.core.base import (
+ ACTIVATION_SAMPLE_BATCH_STATS,
+ ActivationSampleBatch,
+ ActivationSampleHash,
+ ClusterId,
+ ClusterIdHash,
+ ClusterLabel,
+ Direction,
+ TextSample,
+ TextSampleHash,
+ TrackingCriterionHash,
+)
+from spd.clustering.dashboard.core.batch_storage import BatchProcessingStorage
+from spd.clustering.dashboard.core.cluster_data import (
+ BinnedData,
+ ClusterData,
+ TrackingCriterion,
+)
+from spd.clustering.dashboard.core.compute_helpers import (
+ ClusterActivations,
+ ComponentMetrics,
+ compute_all_cluster_activations,
+ compute_cluster_coactivations,
+ compute_component_coactivations_in_cluster,
+ compute_component_cosine_similarities,
+ compute_component_metrics_from_storage,
+)
+from spd.clustering.dashboard.core.dashboard_data import DashboardData
+
+__all__ = [
+ # Type aliases from base
+ "TextSampleHash",
+ "ActivationSampleHash",
+ "ClusterIdHash",
+ "TrackingCriterionHash",
+ "ClusterLabel",
+ "Direction",
+ # Classes from base
+ "SubComponentKey",
+ "ClusterId",
+ "TextSample",
+ "ActivationSampleBatch",
+ # Constants from base
+ "ACTIVATION_SAMPLE_BATCH_STATS",
+ # Classes from cluster_data
+ "TrackingCriterion",
+ "BinnedData",
+ "ClusterData",
+ # Classes from dashboard_data
+ "DashboardData",
+ # Classes from batch_storage
+ "BatchProcessingStorage",
+ # Classes and functions from compute_helpers
+ "ClusterActivations",
+ "ComponentMetrics",
+ "compute_all_cluster_activations",
+ "compute_cluster_coactivations",
+ "compute_component_coactivations_in_cluster",
+ "compute_component_cosine_similarities",
+ "compute_component_metrics_from_storage",
+]
diff --git a/spd/clustering/dashboard/core/base.py b/spd/clustering/dashboard/core/base.py
new file mode 100644
index 000000000..444e9fb63
--- /dev/null
+++ b/spd/clustering/dashboard/core/base.py
@@ -0,0 +1,140 @@
+"""Foundational data structures and type aliases for dashboard."""
+
+import hashlib
+from collections.abc import Callable
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Literal, NewType
+
+import numpy as np
+from jaxtyping import Float
+from muutils.json_serialize import SerializableDataclass, serializable_dataclass
+
+# Type aliases
+TextSampleHash = NewType("TextSampleHash", str)
+ActivationSampleHash = NewType("ActivationSampleHash", str)
+ClusterIdHash = NewType("ClusterIdHash", str)
+TrackingCriterionHash = NewType("TrackingCriterionHash", str)
+ClusterLabel = NewType("ClusterLabel", int) # Just a cluster index
+Direction = Literal["max", "min"]
+
+_SEPARATOR_1: str = "." # For cluster_id parts (e.g., runid-iteration-label)
+_SEPARATOR_2: str = ":" # ONLY for component labels (e.g., module:component_index)
+_SEPARATOR_3: str = "|" # For ALL activation hashes (cluster|text or cluster|comp|text)
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class ClusterId:
+ """Unique identifier for a cluster. This should uniquely identify a cluster *globally*"""
+
+ clustering_run: str # Clustering run identifier
+ iteration: int # Merge iteration number
+ cluster_label: ClusterLabel # Cluster index
+
+ def to_tuple(self) -> tuple[str, int, int]:
+ """Return as a tuple of identifying components."""
+ return (
+ self.clustering_run,
+ self.iteration,
+ self.cluster_label,
+ )
+
+ def to_string(self) -> ClusterIdHash:
+ """Hash uniquely identifying this cluster."""
+ # Use all identifying information to create unique hash
+ parts_tuple = self.to_tuple()
+ parts_str = tuple(str(part) for part in parts_tuple)
+ assert all(_SEPARATOR_1 not in part for part in parts_str), (
+ f"Parts cannot contain separator {_SEPARATOR_1=}, {parts_tuple=}, {parts_str=}"
+ )
+ assert all(_SEPARATOR_2 not in part for part in parts_str), (
+ f"Parts cannot contain separator {_SEPARATOR_2=}, {parts_tuple=}, {parts_str=}"
+ )
+
+ return ClusterIdHash(_SEPARATOR_1.join(parts_str))
+
+ @classmethod
+ def from_string(cls, s: ClusterIdHash) -> "ClusterId":
+ """Create ClusterId from its string representation."""
+ parts: list[str] = s.split(_SEPARATOR_1)
+ if len(parts) != 3:
+ raise ValueError(f"Invalid ClusterId string: {s}")
+ return cls(
+ clustering_run=parts[0],
+ iteration=int(parts[1]),
+ cluster_label=ClusterLabel(int(parts[2])),
+ )
+
+
+@dataclass(frozen=True, kw_only=True)
+class TextSample:
+ """Text content, with a reference to the dataset. depends on tokenizer used."""
+
+ full_text: str
+ tokens: list[str]
+
+ @cached_property
+ def text_hash(self) -> TextSampleHash:
+ """Hash of full_text for deduplication."""
+ return TextSampleHash(hashlib.sha256(self.full_text.encode()).hexdigest()[:8])
+
+ def length(self) -> int:
+ """Return the number of tokens."""
+ return len(self.tokens)
+
+
+@dataclass(frozen=True, kw_only=True)
+class ActivationSampleBatch:
+ cluster_id: ClusterId
+ text_hashes: list[TextSampleHash]
+ activations: Float[np.ndarray, "batch n_ctx"]
+ tokens: list[list[str]] | None = None # Token strings for each sample
+
+ @cached_property
+ def activation_hashes(self) -> list[ActivationSampleHash]:
+ """Hashes uniquely identifying each activation sample (cluster_hash | text_hash)."""
+ cluster_str = self.cluster_id.to_string()
+ return [ActivationSampleHash(f"{cluster_str}{_SEPARATOR_3}{th}") for th in self.text_hashes]
+
+ @cached_property
+ def activation_hashes_short(self) -> list[str]:
+ """Short hashes for frontend (clusterLabel|textHash) without run ID and iteration."""
+ cluster_label = str(self.cluster_id.cluster_label)
+ return [f"{cluster_label}{_SEPARATOR_3}{th}" for th in self.text_hashes]
+
+ @property
+ def shape(self) -> tuple[int, int]:
+ """Return the shape of the activations array (batch_size, n_ctx)."""
+ return self.activations.shape
+
+ def __len__(self) -> int:
+ """Return the number of samples in the batch."""
+ n_samples: int = self.activations.shape[0]
+ assert len(self.text_hashes) == n_samples, "Mismatch between text_hashes and activations"
+ return n_samples
+
+
+ACTIVATION_SAMPLE_BATCH_STATS: dict[
+ str, Callable[[ActivationSampleBatch], Float[np.ndarray, " batch"]]
+] = dict(
+ mean_activation=lambda batch: np.mean(batch.activations, axis=1),
+ min_activation=lambda batch: np.min(batch.activations, axis=1),
+ median_activation=lambda batch: np.median(batch.activations, axis=1),
+ max_activation=lambda batch: np.max(batch.activations, axis=1),
+ max_position=lambda batch: np.argmax(batch.activations, axis=1).astype(float),
+)
+
+
+@serializable_dataclass # pyright: ignore[reportUntypedClassDecorator]
+class ClusterSample(SerializableDataclass):
+ """Self-contained sample combining text reference, tokens, and activations.
+
+ This allows clusters to be self-contained without requiring external lookups.
+ """
+
+ text_hash: str # Reference to TextSample in text_samples dict
+ tokens: list[str] # Token strings for display
+ activations: Float[np.ndarray, " n_ctx"] # ZANJ will save as .npy ref
+ criteria: list[
+ str
+ ] # Which tracking criteria this sample satisfied (e.g., ["max_activation-max-16"])
diff --git a/spd/clustering/dashboard/core/batch_storage.py b/spd/clustering/dashboard/core/batch_storage.py
new file mode 100644
index 000000000..0c754b863
--- /dev/null
+++ b/spd/clustering/dashboard/core/batch_storage.py
@@ -0,0 +1,273 @@
+"""Batch processing storage for accumulating activations."""
+
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import torch
+from jaxtyping import Float, Int
+from torch import Tensor
+from transformers import PreTrainedTokenizer
+
+from spd.clustering.activations import (
+ ProcessedActivations,
+ component_activations,
+ process_activations,
+)
+from spd.clustering.dashboard.core.base import (
+ ClusterId,
+ ClusterIdHash,
+ TextSample,
+ TextSampleHash,
+)
+from spd.clustering.dashboard.core.compute_helpers import (
+ ClusterActivations,
+ compute_all_cluster_activations,
+)
+from spd.clustering.dashboard.core.tokenization import tokenize_and_create_text_samples
+from spd.models.component_model import ComponentModel
+from spd.utils.general_utils import extract_batch_data
+
+
+@dataclass(slots=True, kw_only=True)
+class BatchProcessingStorage:
+ """Storage for accumulating activations during batch processing.
+
+ Attributes:
+ cluster_activations: Cluster-level activations per cluster
+ cluster_text_hashes: Text hashes for cluster-level activations
+ cluster_tokens: Token strings for cluster-level activations
+ component_activations: Component-level activations per cluster per component
+ component_text_hashes: Text hashes for component-level activations
+ text_samples: All unique text samples encountered
+ all_cluster_activations: Cluster activations for coactivation computation
+        cluster_id_map: Pre-computed mapping from cluster indices to ClusterIds
+        cluster_components: Pre-computed component info for each cluster
+        cluster_hash_map: Pre-computed cluster hash string per cluster index
+    """
+
+ cluster_activations: dict[ClusterIdHash, list[Float[np.ndarray, " n_ctx"]]]
+ cluster_text_hashes: dict[ClusterIdHash, list[TextSampleHash]]
+ cluster_tokens: dict[ClusterIdHash, list[list[str]]]
+ component_activations: dict[ClusterIdHash, dict[str, list[Float[np.ndarray, " n_ctx"]]]]
+ component_text_hashes: dict[ClusterIdHash, dict[str, list[TextSampleHash]]]
+ text_samples: dict[TextSampleHash, TextSample]
+ all_cluster_activations: list[ClusterActivations]
+ # Pre-computed to avoid recomputation
+ cluster_id_map: dict[int, ClusterId]
+ cluster_components: dict[int, list[dict[str, Any]]]
+ cluster_hash_map: dict[int, ClusterIdHash]
+
+ @classmethod
+ def create(
+ cls,
+ cluster_id_map: dict[int, ClusterId],
+ cluster_components: dict[int, list[dict[str, Any]]],
+ ) -> "BatchProcessingStorage":
+ """Create initialized storage structures.
+
+ Args:
+ cluster_id_map: Mapping from cluster indices to ClusterId objects
+ cluster_components: Component info for each cluster
+
+ Returns:
+ Initialized BatchProcessingStorage object
+ """
+ unique_cluster_indices: list[int] = list(cluster_id_map.keys())
+
+ # Compute cluster hash strings once to avoid redundant .to_string() calls
+ cluster_hashes: dict[int, ClusterIdHash] = {
+ idx: cluster_id_map[idx].to_string() for idx in unique_cluster_indices
+ }
+
+ cluster_activations: dict[ClusterIdHash, list[Float[np.ndarray, " n_ctx"]]] = {
+ cluster_hashes[idx]: [] for idx in unique_cluster_indices
+ }
+ cluster_text_hashes: dict[ClusterIdHash, list[TextSampleHash]] = {
+ cluster_hashes[idx]: [] for idx in unique_cluster_indices
+ }
+ cluster_tokens: dict[ClusterIdHash, list[list[str]]] = {
+ cluster_hashes[idx]: [] for idx in unique_cluster_indices
+ }
+ component_activations: dict[ClusterIdHash, dict[str, list[Float[np.ndarray, " n_ctx"]]]] = {
+ cluster_hashes[idx]: {comp["label"]: [] for comp in cluster_components[idx]}
+ for idx in unique_cluster_indices
+ }
+ component_text_hashes: dict[ClusterIdHash, dict[str, list[TextSampleHash]]] = {
+ cluster_hashes[idx]: {comp["label"]: [] for comp in cluster_components[idx]}
+ for idx in unique_cluster_indices
+ }
+ text_samples: dict[TextSampleHash, TextSample] = {}
+ all_cluster_activations: list[ClusterActivations] = []
+
+ return cls(
+ cluster_activations=cluster_activations,
+ cluster_text_hashes=cluster_text_hashes,
+ cluster_tokens=cluster_tokens,
+ component_activations=component_activations,
+ component_text_hashes=component_text_hashes,
+ text_samples=text_samples,
+ all_cluster_activations=all_cluster_activations,
+ cluster_id_map=cluster_id_map,
+ cluster_components=cluster_components,
+ cluster_hash_map=cluster_hashes,
+ )
+
+ def process_batch(
+ self,
+ batch_data: Any,
+ model: ComponentModel,
+ tokenizer: PreTrainedTokenizer,
+ device: torch.device,
+ ) -> None:
+ """Process a single batch and update storage.
+
+ Args:
+ batch_data: Raw batch data from dataloader
+ model: ComponentModel to get activations from
+ tokenizer: Tokenizer for decoding
+            device: Device for computation
+        """
+ # Extract and move batch to device
+ batch: Int[Tensor, "batch_size n_ctx"] = extract_batch_data(batch_data).to(device)
+ batch_size: int
+ seq_len: int
+ batch_size, seq_len = batch.shape
+
+ activations: dict[str, Float[Tensor, "n_steps C"]] = component_activations(
+ model,
+ device,
+ batch=batch,
+ )
+
+ processed: ProcessedActivations = process_activations(
+ activations, seq_mode="concat", filter_dead_threshold=0
+ )
+
+ batch_text_samples: list[TextSample] = tokenize_and_create_text_samples(
+ batch=batch.cpu().numpy(),
+ tokenizer=tokenizer,
+ text_samples=self.text_samples,
+ )
+
+ cluster_acts: ClusterActivations = compute_all_cluster_activations(
+ processed=processed,
+ cluster_components=self.cluster_components,
+ batch_size=batch_size,
+ seq_len=seq_len,
+ )
+
+ self._store_activations(
+ cluster_acts=cluster_acts,
+ processed=processed,
+ batch_text_samples=batch_text_samples,
+ batch_size=batch_size,
+ seq_len=seq_len,
+ )
+
+ def _store_activations(
+ self,
+ cluster_acts: ClusterActivations,
+ processed: ProcessedActivations,
+ batch_text_samples: list[TextSample],
+ batch_size: int,
+ seq_len: int,
+ ) -> None:
+ """Store cluster-level and component-level activations from batch.
+
+ Args:
+ cluster_acts: Computed cluster activations
+ processed: Processed component activations
+ batch_text_samples: TextSample objects for the batch
+ batch_size: Batch size
+ seq_len: Sequence length
+ """
+ # Store for coactivation computation
+ self.all_cluster_activations.append(cluster_acts)
+
+ # Move all GPU→CPU transfers outside loops (CRITICAL OPTIMIZATION)
+ # Reshape cluster activations to [batch_size, seq_len, n_clusters]
+ acts_3d: Float[Tensor, "batch_size seq_len n_clusters"] = cluster_acts.activations.view(
+ batch_size, seq_len, -1
+ )
+ acts_3d_cpu: Float[np.ndarray, "batch_size seq_len n_clusters"] = acts_3d.cpu().numpy()
+
+ # Move component activations to CPU and reshape to [batch_size, seq_len, n_components]
+ processed_acts_cpu: Float[np.ndarray, "batch*seq n_components"] = (
+ processed.activations.cpu().numpy()
+ )
+ processed_acts_3d: Float[np.ndarray, "batch_size seq_len n_components"] = (
+ processed_acts_cpu.reshape(batch_size, seq_len, -1)
+ )
+
+ # Pre-extract text hashes and tokens to avoid repeated lookups
+ batch_text_hashes: list[TextSampleHash] = [
+ sample.text_hash for sample in batch_text_samples
+ ]
+ batch_tokens: list[list[str]] = [sample.tokens for sample in batch_text_samples]
+
+ # Pre-compute component label→index mapping to avoid repeated lookups
+ # Build set of all component labels we'll need
+ all_component_labels: set[str] = set()
+ for cluster_components_list in self.cluster_components.values():
+ for comp_info in cluster_components_list:
+ all_component_labels.add(comp_info["label"])
+
+ # Cache the indices
+ label_to_index: dict[str, int | None] = {
+ label: processed.get_label_index(label) for label in all_component_labels
+ }
+
+ # Filter empty clusters early - find which clusters have non-zero activations
+ active_cluster_mask: Float[np.ndarray, " n_clusters"] = (
+ np.abs(acts_3d_cpu).max(axis=(0, 1)) > 0
+ )
+ active_cluster_indices: list[tuple[int, int]] = [
+ (col_idx, cluster_idx)
+ for col_idx, cluster_idx in enumerate(cluster_acts.cluster_indices)
+ if active_cluster_mask[col_idx]
+ ]
+
+ # Store activations per cluster (only active clusters)
+ for cluster_col_idx, cluster_idx in active_cluster_indices:
+ cluster_acts_2d: Float[np.ndarray, "batch_size seq_len"] = acts_3d_cpu[
+ :, :, cluster_col_idx
+ ]
+
+ # Use pre-computed hash instead of calling .to_string()
+ current_cluster_hash: ClusterIdHash = self.cluster_hash_map[cluster_idx]
+
+ # Get components for this cluster once (move invariant out of batch loop)
+ components_in_cluster: list[dict[str, Any]] = self.cluster_components[cluster_idx]
+
+ # Cache dictionary lookups - get list references once
+ cluster_acts_list = self.cluster_activations[current_cluster_hash]
+ cluster_text_hashes_list = self.cluster_text_hashes[current_cluster_hash]
+ cluster_tokens_list = self.cluster_tokens[current_cluster_hash]
+
+ # Vectorize storage - use .extend() instead of repeated .append()
+ # Store cluster-level activations for entire batch at once
+ cluster_acts_list.extend(cluster_acts_2d[i] for i in range(batch_size))
+ cluster_text_hashes_list.extend(batch_text_hashes)
+ cluster_tokens_list.extend(batch_tokens)
+
+ # Store component-level activations
+ for component_info in components_in_cluster:
+ component_label: str = component_info["label"]
+ comp_idx: int | None = label_to_index[component_label]
+
+ if comp_idx is not None:
+ # Cache nested dictionary lookups
+ comp_acts_list = self.component_activations[current_cluster_hash][
+ component_label
+ ]
+ comp_text_hashes_list = self.component_text_hashes[current_cluster_hash][
+ component_label
+ ]
+
+ # Collect all component activations for this batch
+ batch_comp_acts = [processed_acts_3d[i, :, comp_idx] for i in range(batch_size)]
+
+ # Extend once instead of appending in loop
+ comp_acts_list.extend(batch_comp_acts)
+ comp_text_hashes_list.extend(batch_text_hashes)
diff --git a/spd/clustering/dashboard/core/cluster_data.py b/spd/clustering/dashboard/core/cluster_data.py
new file mode 100644
index 000000000..9a08b2d45
--- /dev/null
+++ b/spd/clustering/dashboard/core/cluster_data.py
@@ -0,0 +1,306 @@
+"""Cluster-specific data structures for dashboard."""
+
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+from jaxtyping import Float
+
+from spd.clustering.consts import SubComponentKey
+from spd.clustering.dashboard.core.base import (
+ _SEPARATOR_1,
+ _SEPARATOR_3,
+ ACTIVATION_SAMPLE_BATCH_STATS,
+ ActivationSampleBatch,
+ ActivationSampleHash,
+ ClusterId,
+ ClusterIdHash,
+ ClusterSample,
+ Direction,
+ TextSampleHash,
+ TrackingCriterionHash,
+)
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class TrackingCriterion:
+ """Defines what statistics to track."""
+
+ property_name: str
+    "max_activation, mean_activation, etc. - must be a key in ACTIVATION_SAMPLE_BATCH_STATS"
+
+ direction: Direction
+
+ n_samples: int
+
+ def to_tuple(self) -> tuple[str, Direction, int]:
+ """Return as a tuple of (property_name, direction, n_samples)."""
+ return (self.property_name, self.direction, self.n_samples)
+
+ def to_string(self) -> TrackingCriterionHash:
+ """Convert to hash string representation."""
+ parts = (self.property_name, self.direction, str(self.n_samples))
+ assert all(_SEPARATOR_1 not in part for part in parts), "Parts cannot contain separator"
+ return TrackingCriterionHash(_SEPARATOR_1.join(parts))
+
+ @classmethod
+ def from_string(cls, s: TrackingCriterionHash) -> "TrackingCriterion":
+ """Create TrackingCriterion from its string representation."""
+ parts: list[str] = s.split(_SEPARATOR_1)
+ if len(parts) != 3:
+ raise ValueError(f"Invalid TrackingCriterion string: {s}")
+ direction = parts[1]
+ if direction not in ("max", "min"):
+ raise ValueError(f"Invalid direction: {direction}")
+ return cls(
+ property_name=parts[0],
+ direction=direction, # type: ignore[arg-type]
+ n_samples=int(parts[2]),
+ )
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class BinnedData:
+ bin_edges: list[float]
+ bin_counts: list[int]
+
+ @classmethod
+ def from_arr(cls, arr: np.ndarray, n_bins: int) -> "BinnedData":
+ """Create BinnedData from a numpy array."""
+ counts, edges = np.histogram(arr, bins=n_bins)
+ return cls(bin_edges=edges.tolist(), bin_counts=counts.tolist())
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class ClusterData:
+ cluster_hash: ClusterIdHash
+ components: list[SubComponentKey] # Component info: module, index, label
+ samples: list[ClusterSample] # Self-contained samples with text + activations
+ stats: dict[str, Any]
+ # Component-level metrics
+ component_coactivations: Float[np.ndarray, "n_comps n_comps"] | None = None
+ # TODO: Add component_cosine_similarities for U/V vectors when dimension mismatch is resolved
+
+ # DEPRECATED: Can be removed once refactor is complete and JS is updated
+ criterion_samples: dict[TrackingCriterionHash, list[TextSampleHash]] | None = None
+
+ @classmethod
+ def generate(
+ cls,
+ cluster_id: ClusterId,
+ activation_samples: ActivationSampleBatch,
+ criteria: list[TrackingCriterion],
+ components: list[SubComponentKey],
+ hist_bins: int = 10,
+ activation_threshold: float = 0.5,
+ top_n_tokens: int = 50,
+ ) -> "ClusterData":
+ cluster_hash: ClusterIdHash = cluster_id.to_string()
+ stats: dict[str, Any] = dict()
+
+ # Build mapping: text_hash -> (sample_idx, criteria_list)
+ sample_criteria_map: dict[TextSampleHash, list[str]] = {}
+ sample_indices: dict[TextSampleHash, int] = {}
+
+ # DEPRECATED: For backward compatibility during transition
+ criterion_samples_deprecated: dict[TrackingCriterionHash, list[TextSampleHash]] = {}
+
+        # Select the top-k samples per criterion
+ for criterion in criteria:
+ # Extract property values
+ prop_values: Float[np.ndarray, " batch"] = ACTIVATION_SAMPLE_BATCH_STATS[
+ criterion.property_name
+ ](activation_samples)
+ # Sort by property value
+ reverse: bool = criterion.direction == "max"
+
+ # Zip property values with text hashes and indices
+ samples_with_values: list[tuple[int, TextSampleHash, float]] = [
+ (idx, th, val)
+ for idx, (th, val) in enumerate(
+ zip(activation_samples.text_hashes, prop_values.tolist(), strict=True)
+ )
+ ]
+ samples_with_values.sort(key=lambda x: x[2], reverse=reverse)
+
+ # Take top k
+ top_k: list[tuple[int, TextSampleHash, float]] = samples_with_values[
+ : criterion.n_samples
+ ]
+
+ criterion_str = criterion.to_string()
+
+ # Track which samples satisfy which criteria
+ for idx, text_hash, _ in top_k:
+ sample_criteria_map.setdefault(text_hash, []).append(criterion_str)
+ sample_indices[text_hash] = idx
+
+ # DEPRECATED: Keep for backward compat
+ criterion_samples_deprecated[criterion.to_string()] = [th for _, th, _ in top_k]
+
+ # Add stats
+ stats[criterion_str] = BinnedData.from_arr(
+ prop_values,
+ n_bins=hist_bins,
+ )
+
+ # Add general stats
+ all_activations: Float[np.ndarray, " batch n_ctx"] = activation_samples.activations
+ stats["all_activations"] = BinnedData.from_arr(
+ all_activations.flatten(),
+ n_bins=hist_bins,
+ )
+ stats["n_samples"] = len(activation_samples)
+ stats["n_tokens"] = int(all_activations.size)
+ stats["mean_activation"] = float(np.mean(all_activations))
+ stats["min_activation"] = float(np.min(all_activations))
+ stats["max_activation"] = float(np.max(all_activations))
+ stats["median_activation"] = float(np.median(all_activations))
+
+ # Compute max activation position distribution (how concentrated are the max activations)
+ # For each sample, find the position (index) where max activation occurs
+ max_positions: np.ndarray = np.argmax(all_activations, axis=1) # shape: (batch,)
+ # Normalize positions to [0, 1] range
+ n_ctx: int = all_activations.shape[1]
+ normalized_positions: np.ndarray = max_positions.astype(float) / max(1, n_ctx - 1)
+
+ # Sanity check: positions should always be in [0, 1]
+ assert normalized_positions.min() >= 0, f"Position min={normalized_positions.min()} < 0"
+ assert normalized_positions.max() <= 1, f"Position max={normalized_positions.max()} > 1"
+
+ stats["max_activation_position"] = BinnedData.from_arr(
+ normalized_positions,
+ n_bins=hist_bins,
+ )
+
+ # Token-level activation statistics
+ if activation_samples.tokens is not None:
+ from collections import Counter
+
+ # Count activations per token above threshold
+ token_activation_counts: Counter[str] = Counter()
+
+ for sample_idx, token_list in enumerate(activation_samples.tokens):
+ sample_activations = all_activations[sample_idx] # shape: (n_ctx,)
+
+ for token_idx, token in enumerate(token_list):
+ if sample_activations[token_idx] > activation_threshold:
+ token_activation_counts[token] += 1
+
+ # Get top N most frequently activated tokens
+ top_tokens: list[tuple[str, int]] = token_activation_counts.most_common(top_n_tokens)
+
+ # Distribution statistics
+ total_unique_tokens: int = len(token_activation_counts)
+ total_activations: int = sum(token_activation_counts.values())
+
+ # Compute concentration metrics
+ if total_activations > 0 and total_unique_tokens > 0:
+ # Entropy: measures how evenly distributed activations are across tokens
+ counts_array: np.ndarray = np.array(list(token_activation_counts.values()))
+ probs: np.ndarray = counts_array / total_activations
+ entropy: float = float(-np.sum(probs * np.log(probs + 1e-10)))
+
+ # Concentration ratio: fraction of activations in top 10% of tokens
+ top_10pct_count: int = max(1, total_unique_tokens // 10)
+ top_10pct_activations: int = sum(
+ count for _, count in token_activation_counts.most_common(top_10pct_count)
+ )
+ concentration_ratio: float = (
+ top_10pct_activations / total_activations if total_activations > 0 else 0.0
+ )
+ else:
+ entropy = 0.0
+ concentration_ratio = 0.0
+
+ stats["token_activations"] = {
+ "top_tokens": [{"token": token, "count": count} for token, count in top_tokens],
+ "total_unique_tokens": total_unique_tokens,
+ "total_activations": total_activations,
+ "entropy": entropy,
+ "concentration_ratio": concentration_ratio,
+ "activation_threshold": activation_threshold,
+ }
+
+ # Build self-contained ClusterSample objects
+ samples: list[ClusterSample] = []
+ for text_hash, criteria_list in sample_criteria_map.items():
+ sample_idx = sample_indices[text_hash]
+ samples.append(
+ ClusterSample(
+ text_hash=str(text_hash), # pyright: ignore[reportCallIssue]
+ tokens=activation_samples.tokens[sample_idx] # pyright: ignore[reportCallIssue]
+ if activation_samples.tokens
+ else [],
+ activations=activation_samples.activations[sample_idx], # pyright: ignore[reportCallIssue]
+ criteria=criteria_list, # pyright: ignore[reportCallIssue]
+ )
+ )
+
+ return cls(
+ cluster_hash=cluster_hash,
+ components=components,
+ samples=samples,
+ stats=stats,
+ criterion_samples=criterion_samples_deprecated, # DEPRECATED
+ )
+
+ def get_unique_text_hashes(self) -> set[TextSampleHash]:
+ """Get all unique text hashes across all criteria."""
+ unique_hashes: set[TextSampleHash] = set()
+ assert self.criterion_samples is not None, "criterion_samples is None"
+ for hashes in self.criterion_samples.values():
+ unique_hashes.update(hashes)
+ return unique_hashes
+
+ def get_unique_activation_hashes(self) -> set[ActivationSampleHash]:
+ """Get all unique activation hashes across all criteria."""
+ unique_hashes: set[ActivationSampleHash] = set()
+ cluster_str = self.cluster_hash
+ assert self.criterion_samples is not None, "criterion_samples is None"
+ for hashes in self.criterion_samples.values():
+ unique_hashes.update(
+ ActivationSampleHash(f"{cluster_str}{_SEPARATOR_3}{th}") for th in hashes
+ )
+ return unique_hashes
+
+ def serialize(self) -> dict[str, Any]:
+ """Serialize to a dictionary."""
+ # Convert stats to JSON-compatible format
+ serialized_stats: dict[str, Any] = {}
+ for key, value in self.stats.items():
+ if isinstance(value, BinnedData):
+ serialized_stats[key] = {
+ "bin_edges": value.bin_edges,
+ "bin_counts": value.bin_counts,
+ }
+ elif isinstance(value, int | float):
+ serialized_stats[key] = value
+ elif isinstance(value, np.ndarray):
+ serialized_stats[key] = value.tolist()
+ else:
+ serialized_stats[key] = value
+
+ result: dict[str, Any] = {
+ "cluster_hash": self.cluster_hash,
+ "components": [
+ {"module": comp.module, "index": comp.index, "label": comp.label}
+ for comp in self.components
+ ],
+ # Serialize samples (ClusterSample has .serialize() from SerializableDataclass)
+ "samples": [sample.serialize() for sample in self.samples],
+ "stats": serialized_stats,
+ }
+
+ # DEPRECATED: Keep for backward compat during transition
+ if self.criterion_samples is not None:
+ result["criterion_samples"] = {
+ str(k): [str(h) for h in v] for k, v in self.criterion_samples.items()
+ }
+
+ # Add component-level metrics if available
+ if self.component_coactivations is not None:
+ result["component_coactivations"] = self.component_coactivations.tolist()
+ # TODO: Serialize component_cosine_similarities when implemented
+
+ return result
diff --git a/spd/clustering/dashboard/core/compute_helpers.py b/spd/clustering/dashboard/core/compute_helpers.py
new file mode 100644
index 000000000..acb2aad1e
--- /dev/null
+++ b/spd/clustering/dashboard/core/compute_helpers.py
@@ -0,0 +1,287 @@
+"""Helper functions for computing cluster and component activations."""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import torch
+from jaxtyping import Float
+from torch import Tensor
+
+from spd.clustering.activations import ProcessedActivations
+
+if TYPE_CHECKING:
+ from spd.models.component_model import ComponentModel
+
+
+@dataclass(slots=True, kw_only=True)
+class ClusterActivations:
+ """Vectorized cluster activations for all clusters.
+
+ Attributes:
+ activations: Tensor of shape [n_samples, n_clusters] containing cluster activations
+ cluster_indices: List mapping column index to cluster index
+ """
+
+ activations: Float[Tensor, "n_samples n_clusters"]
+ cluster_indices: list[int]
+
+
+def compute_all_cluster_activations(
+ processed: ProcessedActivations,
+ cluster_components: dict[int, list[dict[str, Any]]],
+ batch_size: int,
+ seq_len: int,
+) -> ClusterActivations:
+ """Compute activations for all clusters in a vectorized manner.
+
+ Args:
+ processed: ProcessedActivations containing all component activations
+ cluster_components: Dict mapping cluster_idx -> list of component info dicts
+ batch_size: Batch size
+ seq_len: Sequence length
+
+ Returns:
+ ClusterActivations with shape [batch_size * seq_len, n_clusters]
+ """
+ cluster_indices: list[int] = sorted(cluster_components.keys())
+ n_clusters: int = len(cluster_indices)
+ n_samples: int = batch_size * seq_len
+ device: torch.device = processed.activations.device
+
+ # Build cluster activation tensor: [n_samples, n_clusters]
+ cluster_acts: Float[Tensor, "n_samples n_clusters"] = torch.zeros(
+ (n_samples, n_clusters), device=device
+ )
+
+ # For each cluster, compute max activation across its components
+ for cluster_col_idx, cluster_idx in enumerate(cluster_indices):
+ components: list[dict[str, Any]] = cluster_components[cluster_idx]
+
+ # Get component indices for this cluster
+ comp_indices: list[int] = []
+ for component_info in components:
+ label: str = component_info["label"]
+ comp_idx: int | None = processed.get_label_index(label)
+ if comp_idx is not None:
+ comp_indices.append(comp_idx)
+
+ if not comp_indices:
+ continue
+
+ # Max activation across components for this cluster
+ # processed.activations: [n_samples, n_components]
+ cluster_acts[:, cluster_col_idx] = processed.activations[:, comp_indices].max(dim=1).values
+
+ return ClusterActivations(activations=cluster_acts, cluster_indices=cluster_indices)
+
+
+def compute_cluster_coactivations(
+ cluster_activations_list: list[ClusterActivations],
+) -> tuple[Float[np.ndarray, "n_clusters n_clusters"], list[int]]:
+ """Compute coactivation matrix for clusters across all batches.
+
+ Args:
+ cluster_activations_list: List of ClusterActivations from each batch
+
+ Returns:
+ Tuple of (coactivation_matrix, cluster_indices) where coact[i,j] is the number
+ of samples where both cluster i and j activate
+ """
+ if not cluster_activations_list:
+ return np.array([[]], dtype=np.float32), []
+
+ # All batches should have same cluster indices
+ cluster_indices: list[int] = cluster_activations_list[0].cluster_indices
+ _n_clusters: int = len(cluster_indices)
+
+ # Concatenate all batch activations: [total_samples, n_clusters]
+ all_acts: Float[Tensor, "total_samples n_clusters"] = torch.cat(
+ [ca.activations for ca in cluster_activations_list], dim=0
+ )
+
+ # Binarize activations (1 if cluster activates, 0 otherwise)
+ activation_mask: Float[Tensor, "total_samples n_clusters"] = (all_acts > 0).float()
+
+ # Compute coactivation matrix: coact[i,j] = sum over samples of (cluster_i_active * cluster_j_active)
+ # Following spd/clustering/merge.py:69
+ coact: Float[Tensor, "n_clusters n_clusters"] = activation_mask.T @ activation_mask
+
+ return coact.cpu().numpy(), cluster_indices
+
+
+def compute_component_coactivations_in_cluster(
+ processed: ProcessedActivations,
+ component_labels: list[str],
+ activation_threshold: float = 0.0,
+) -> Float[np.ndarray, "n_comps n_comps"]:
+ """Compute coactivation matrix for components within a cluster.
+
+ Args:
+ processed: ProcessedActivations containing all component activations
+ component_labels: List of component labels in this cluster
+ activation_threshold: Threshold for considering a component "active"
+
+ Returns:
+ Coactivation matrix where coact[i,j] is the number of samples where
+ both component i and j activate above threshold
+ """
+ n_components: int = len(component_labels)
+
+ # Get indices for these components
+ comp_indices: list[int] = []
+ for label in component_labels:
+ comp_idx: int | None = processed.get_label_index(label)
+ if comp_idx is not None:
+ comp_indices.append(comp_idx)
+
+ if not comp_indices:
+ return np.zeros((n_components, n_components), dtype=np.float32)
+
+ # Extract activations for these components: [n_samples, n_components]
+ component_acts: Float[Tensor, "n_samples n_comps"] = processed.activations[:, comp_indices]
+
+ # Binarize activations (1 if component activates above threshold, 0 otherwise)
+ activation_mask: Float[Tensor, "n_samples n_comps"] = (
+ component_acts > activation_threshold
+ ).float()
+
+ # Compute coactivation matrix: coact[i,j] = sum over samples of (comp_i_active * comp_j_active)
+ coact: Float[Tensor, "n_comps n_comps"] = activation_mask.T @ activation_mask
+
+ return coact.cpu().numpy()
+
+
+def compute_component_cosine_similarities(
+ model: "ComponentModel", # pyright: ignore[reportUnusedParameter]
+ component_labels: list[str], # pyright: ignore[reportUnusedParameter]
+) -> Float[np.ndarray, "2 n_comps n_comps"]:
+ """Compute cosine similarity matrices for U and V vectors of components.
+
+ NOTE: Not implemented - U and V vectors may have different dimensions across components.
+
+ Args:
+ model: ComponentModel containing the components with U and V matrices
+ component_labels: List of component labels in format "module_name:component_idx"
+
+ Returns:
+ Array of shape (2, n_comps, n_comps) where:
+ - [0, i, j] is cosine similarity between U vectors of components i and j
+ - [1, i, j] is cosine similarity between V vectors of components i and j
+ """
+ raise NotImplementedError(
+ "compute_component_cosine_similarities is not implemented - "
+ "U and V vectors may have different dimensions across components"
+ )
+
+ # Code kept for reference:
+ # n_components: int = len(component_labels)
+ #
+ # if n_components == 0:
+ # return np.zeros((2, 0, 0), dtype=np.float32)
+ #
+ # # Parse labels and extract U and V vectors
+ # u_vectors: list[Tensor] = []
+ # v_vectors: list[Tensor] = []
+ #
+ # for label in component_labels:
+ # # Parse label format: "module_name:component_idx"
+ # module_name, comp_idx_str = label.rsplit(":", 1)
+ # comp_idx: int = int(comp_idx_str)
+ #
+ # # Get the Components object for this module
+ # if module_name not in model.components:
+ # raise ValueError(f"Module '{module_name}' not found in model.components")
+ #
+ # components = model.components[module_name]
+ #
+ # # Extract U vector: components.U[comp_idx, :] shape (u_dim,)
+ # u_vec: Tensor = components.U[comp_idx, :]
+ # u_vectors.append(u_vec)
+ #
+ # # Extract V vector: components.V[:, comp_idx] shape (v_dim,)
+ # v_vec: Tensor = components.V[:, comp_idx]
+ # v_vectors.append(v_vec)
+ #
+ # # Stack vectors: [n_comps, u_dim] and [n_comps, v_dim]
+ # u_matrix: Tensor = torch.stack(u_vectors, dim=0) # [n_comps, u_dim]
+ # v_matrix: Tensor = torch.stack(v_vectors, dim=0) # [n_comps, v_dim]
+ #
+ # # Compute cosine similarities for U vectors
+ # u_norms: Tensor = torch.norm(u_matrix, p=2, dim=1) # [n_comps]
+ # u_norms = torch.where(u_norms > 0, u_norms, torch.ones_like(u_norms))
+ # u_normalized: Tensor = u_matrix / u_norms.unsqueeze(1) # [n_comps, u_dim]
+ # u_cosine_sim: Tensor = u_normalized @ u_normalized.T # [n_comps, n_comps]
+ #
+ # # Compute cosine similarities for V vectors
+ # v_norms: Tensor = torch.norm(v_matrix, p=2, dim=1) # [n_comps]
+ # v_norms = torch.where(v_norms > 0, v_norms, torch.ones_like(v_norms))
+ # v_normalized: Tensor = v_matrix / v_norms.unsqueeze(1) # [n_comps, v_dim]
+ # v_cosine_sim: Tensor = v_normalized @ v_normalized.T # [n_comps, n_comps]
+ #
+ # # Stack and return: [2, n_comps, n_comps]
+ # result: np.ndarray = np.stack(
+ # [u_cosine_sim.detach().cpu().numpy(), v_cosine_sim.detach().cpu().numpy()], axis=0
+ # )
+ #
+ # return result
+
+
+@dataclass(frozen=True, slots=True, kw_only=True)
+class ComponentMetrics:
+ """Combined metrics for components within a cluster.
+
+ Attributes:
+ coactivations: Matrix where coact[i,j] is count of samples where both i and j activate
+
+ TODO: Add cosine_similarities field for U/V vectors when dimension mismatch is resolved
+ """
+
+ coactivations: Float[np.ndarray, "n_comps n_comps"]
+
+
+def compute_component_metrics_from_storage(
+ component_labels: list[str],
+ component_activations: dict[str, list[Float[np.ndarray, " n_ctx"]]],
+) -> ComponentMetrics | None:
+ """Compute coactivations from stored component activations.
+
+ Args:
+ component_labels: List of component labels in this cluster
+ component_activations: Dict mapping component labels to their activation lists
+
+ Returns:
+ ComponentMetrics with coactivations, or None if insufficient data
+
+ TODO: Add cosine similarities for U/V vectors when dimension mismatch is resolved
+ """
+ if not component_labels:
+ return None
+
+ n_comps: int = len(component_labels)
+
+ # Build activation matrix for all components: [n_comps, n_total_samples]
+ comp_act_matrix_list: list[Float[np.ndarray, "n_samples n_ctx"]] = []
+ for comp_label in component_labels:
+ if comp_label in component_activations:
+ comp_acts_list = component_activations[comp_label]
+ if comp_acts_list:
+ comp_act_matrix_list.append(np.stack(comp_acts_list))
+
+ if not comp_act_matrix_list or len(comp_act_matrix_list) != n_comps:
+ # Return zero matrices if not all components have data
+ return ComponentMetrics(
+ coactivations=np.zeros((n_comps, n_comps), dtype=np.float32),
+ )
+
+ # Flatten to [n_total] per component, then stack to [n_comps, n_total]
+ comp_act_flat: list[Float[np.ndarray, " n_total"]] = [
+ arr.flatten() for arr in comp_act_matrix_list
+ ]
+ comp_act_matrix: Float[np.ndarray, "n_comps n_total"] = np.stack(comp_act_flat, axis=0)
+
+ # Compute coactivations (binarized)
+ comp_act_bin: Float[np.ndarray, "n_comps n_total"] = (comp_act_matrix > 0).astype(np.float32)
+ coactivations: Float[np.ndarray, "n_comps n_comps"] = comp_act_bin @ comp_act_bin.T
+
+ return ComponentMetrics(coactivations=coactivations)
diff --git a/spd/clustering/dashboard/core/compute_max_act.py b/spd/clustering/dashboard/core/compute_max_act.py
new file mode 100644
index 000000000..95cc7ba4c
--- /dev/null
+++ b/spd/clustering/dashboard/core/compute_max_act.py
@@ -0,0 +1,126 @@
+"""Core computation logic for finding max-activating text samples."""
+
+from typing import Any
+
+import torch
+from muutils.spinner import SpinnerContext
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import PreTrainedTokenizer
+
+from spd.clustering.dashboard.core import (
+ BatchProcessingStorage,
+ ClusterId,
+ ClusterLabel,
+ DashboardData,
+ TrackingCriterion,
+ compute_cluster_coactivations,
+)
+from spd.clustering.merge_history import MergeHistory
+from spd.models.component_model import ComponentModel
+from spd.utils.general_utils import get_obj_device
+
+
+def compute_max_activations(
+ model: ComponentModel,
+ tokenizer: PreTrainedTokenizer,
+ dataloader: DataLoader[Any],
+ merge_history: MergeHistory,
+ iteration: int,
+ n_samples: int,
+ n_batches: int,
+ clustering_run: str,
+) -> DashboardData:
+ """Compute max-activating text samples for each cluster and coactivation matrix.
+
+ Args:
+ model: ComponentModel to get activations from
+ tokenizer: Tokenizer for decoding text
+ dataloader: DataLoader providing batches
+ merge_history: MergeHistory containing cluster information
+ iteration: Merge iteration to analyze
+ n_samples: Number of top samples to track per cluster
+ n_batches: Number of batches to process
+ clustering_run: Clustering run identifier used to construct
+ ClusterId objects for each cluster
+
+ Returns:
+ DashboardData with all cluster data, text samples, and coactivation information
+ """
+ device: torch.device = get_obj_device(model)
+
+ # Setup: Get cluster info and create ClusterId objects
+ unique_MH_cluster_indices: list[int] = merge_history.get_unique_clusters(iteration)
+ cluster_components: dict[int, list[dict[str, Any]]] = {
+ cid: merge_history.get_cluster_components_info(iteration, cid)
+ for cid in unique_MH_cluster_indices
+ }
+
+ cluster_id_map: dict[int, ClusterId] = {}
+ for idx in unique_MH_cluster_indices:
+ components: list[dict[str, Any]] = cluster_components[idx]
+ assert components, f"Cluster {idx} has no components"
+ cluster_id_map[idx] = ClusterId(
+ clustering_run=clustering_run,
+ iteration=iteration,
+ cluster_label=ClusterLabel(idx),
+ )
+
+ criteria: list[TrackingCriterion] = [
+ TrackingCriterion(
+ property_name="max_activation",
+ direction="max",
+ n_samples=n_samples,
+ )
+ ]
+
+ # Initialize storage
+ storage: BatchProcessingStorage = BatchProcessingStorage.create(
+ cluster_id_map, cluster_components
+ )
+
+ # Process batches
+ print(f"\nProcessing {n_batches} batches...")
+ for batch_idx, batch_data in enumerate(dataloader):
+ if batch_idx >= n_batches:
+ break
+
+ with SpinnerContext(
+ message=f" Batch {batch_idx + 1}/{n_batches}",
+ format_string=" \r{spinner} ({elapsed_time:.2f}s) {message}{value}",
+ update_interval=0.33,
+ ):
+ storage.process_batch(
+ batch_data=batch_data,
+ model=model,
+ tokenizer=tokenizer,
+ device=device,
+ )
+
+ # Create DashboardData and add clusters incrementally
+ dashboard: DashboardData = DashboardData.create(text_samples=storage.text_samples)
+
+ for cluster_idx in tqdm(unique_MH_cluster_indices, desc="Building cluster data"):
+ cluster_id: ClusterId = cluster_id_map[cluster_idx]
+ cluster_hash = cluster_id.to_string()
+
+ dashboard.add_cluster(
+ cluster_id=cluster_id,
+ cluster_components=cluster_components[cluster_idx],
+ criteria=criteria,
+ cluster_activations=storage.cluster_activations[cluster_hash],
+ cluster_text_hashes=storage.cluster_text_hashes[cluster_hash],
+ cluster_tokens=storage.cluster_tokens[cluster_hash],
+ )
+
+ # Compute coactivations
+ with SpinnerContext(message="Computing cluster coactivations"):
+ coactivations, cluster_indices = compute_cluster_coactivations(
+ storage.all_cluster_activations
+ )
+
+ # Attach coactivation data to dashboard
+ dashboard.coactivations = coactivations
+ dashboard.cluster_indices = cluster_indices
+
+ return dashboard
diff --git a/spd/clustering/dashboard/core/dashboard_config.py b/spd/clustering/dashboard/core/dashboard_config.py
new file mode 100644
index 000000000..9a35a5e12
--- /dev/null
+++ b/spd/clustering/dashboard/core/dashboard_config.py
@@ -0,0 +1,44 @@
+"""Configuration model for computing max-activating text samples for language model component clusters."""
+
+from pathlib import Path
+
+from pydantic import Field
+
+from spd.base_config import BaseConfig
+from spd.settings import REPO_ROOT
+
+
+class DashboardConfig(BaseConfig):
+ wandb_run: str = Field(description="WandB clustering run path (e.g., entity/project/run_id)")
+ output_dir: Path = Field(
+ default=REPO_ROOT / "spd/clustering/dashboard/data",
+ description="Base output directory (default: {{REPO_ROOT}}/spd/clustering/dashboard/data/)",
+ )
+ iteration: int = Field(
+ default=-1,
+ description="Merge iteration to analyze (negative indexes from end, default: -1 for latest)",
+ )
+ n_samples: int = Field(
+ default=16,
+ description="Number of top-activating samples to collect per cluster",
+ )
+ n_batches: int = Field(
+ default=4,
+ description="Number of data batches to process",
+ )
+ batch_size: int = Field(
+ default=64,
+ description="Batch size for data loading",
+ )
+ context_length: int = Field(
+ default=64,
+ description="Context length for tokenization",
+ )
+ write_html: bool = Field(
+ default=False,
+ description="Write bundled HTML files to output directory",
+ )
+ dataset_streaming: bool = Field(
+ default=False,
+ description="Whether to use streaming dataset loading. Recommended: True for large datasets or tests.",
+ )
diff --git a/spd/clustering/dashboard/core/dashboard_data.py b/spd/clustering/dashboard/core/dashboard_data.py
new file mode 100644
index 000000000..a9351f92c
--- /dev/null
+++ b/spd/clustering/dashboard/core/dashboard_data.py
@@ -0,0 +1,103 @@
+"""Top-level dashboard data structure."""
+
+from typing import Any
+
+import numpy as np
+from jaxtyping import Float
+from muutils.json_serialize import SerializableDataclass, serializable_dataclass, serializable_field
+
+from spd.clustering.consts import SubComponentKey
+from spd.clustering.dashboard.core.base import (
+ ActivationSampleBatch,
+ ClusterId,
+ ClusterIdHash,
+ TextSample,
+ TextSampleHash,
+)
+from spd.clustering.dashboard.core.cluster_data import ClusterData, TrackingCriterion
+
+
+@serializable_dataclass(kw_only=True) # pyright: ignore[reportUntypedClassDecorator]
+class DashboardData(SerializableDataclass):
+ """All data for the dashboard.
+
+ Self-contained data structure that ZANJ can save/load automatically.
+ Clusters contain their own activation data (no external references needed).
+ """
+
+ model_info: dict[str, Any] = serializable_field(default_factory=dict)
+ clusters: dict[ClusterIdHash, ClusterData] = serializable_field(default_factory=dict)
+ text_samples: dict[TextSampleHash, TextSample] = serializable_field(default_factory=dict)
+
+ # Optional global metrics
+ coactivations: Float[np.ndarray, "n_clusters n_clusters"] | None = serializable_field(
+ default=None
+ )
+ cluster_indices: list[int] | None = serializable_field(default=None)
+
+ @classmethod
+ def create(cls, text_samples: dict[TextSampleHash, TextSample]) -> "DashboardData":
+ """Initialize empty dashboard data.
+
+ Args:
+ text_samples: Text samples dict to share across clusters
+
+ Returns:
+ Empty DashboardData ready for incremental population
+ """
+ return cls(text_samples=text_samples) # pyright: ignore[reportCallIssue]
+
+ def add_cluster(
+ self,
+ cluster_id: ClusterId,
+ cluster_components: list[dict[str, Any]],
+ criteria: list[TrackingCriterion],
+ cluster_activations: list[Float[np.ndarray, " n_ctx"]],
+ cluster_text_hashes: list[TextSampleHash],
+ cluster_tokens: list[list[str]],
+ ) -> None:
+ """Build and add a cluster to the dashboard.
+
+ Args:
+ cluster_id: ClusterId object
+ cluster_components: Component info for this cluster
+ criteria: Tracking criteria for top-k samples
+ cluster_activations: List of activation arrays for this cluster
+ cluster_text_hashes: List of text hashes for cluster activations
+ cluster_tokens: List of token strings for cluster activations
+ """
+ cluster_hash: ClusterIdHash = cluster_id.to_string()
+
+ # Skip if cluster has no activations
+ if not cluster_activations:
+ return
+
+ # Stack cluster-level activations into batch
+ acts_array: Float[np.ndarray, "batch n_ctx"] = np.stack(cluster_activations)
+
+ activation_batch: ActivationSampleBatch = ActivationSampleBatch(
+ cluster_id=cluster_id,
+ text_hashes=cluster_text_hashes,
+ activations=acts_array,
+ tokens=cluster_tokens,
+ )
+
+ # Convert component info dicts to SubComponentKey objects
+ components_info: list[SubComponentKey] = [
+ SubComponentKey(module=comp["module"], index=comp["index"])
+ for comp in cluster_components
+ ]
+
+ # Generate ClusterData with stats and top-k samples (now self-contained!)
+ cluster_data: ClusterData = ClusterData.generate(
+ cluster_id=cluster_id,
+ activation_samples=activation_batch,
+ criteria=criteria,
+ components=components_info,
+ )
+
+ # Store cluster (activations are now embedded in cluster_data.samples!)
+ self.clusters[cluster_hash] = cluster_data
+
+ # NOTE: No custom serialization methods are needed here;
+ # .serialize() is provided by SerializableDataclass and ZANJ handles persistence.
diff --git a/spd/clustering/dashboard/core/dashboard_io.py b/spd/clustering/dashboard/core/dashboard_io.py
new file mode 100644
index 000000000..e7d63bca3
--- /dev/null
+++ b/spd/clustering/dashboard/core/dashboard_io.py
@@ -0,0 +1,168 @@
+"""I/O utilities for dashboard: WandB artifacts, model setup, and result generation."""
+
+from pathlib import Path
+from typing import Any
+
+import torch
+import wandb
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from wandb.apis.public import Run
+
+from spd.clustering.consts import SubComponentKey, SubComponentLabel
+from spd.clustering.dashboard.core.tokenization import attach_vocab_arr
+from spd.clustering.math.merge_matrix import GroupMerge
+from spd.clustering.merge_history import MergeHistory
+from spd.clustering.scripts.run_clustering import ClusteringRunStorage
+from spd.configs import Config
+from spd.data import DatasetConfig, create_data_loader
+from spd.log import logger
+from spd.models.component_model import ComponentModel, SPDRunInfo
+
+
+def load_wandb_artifacts(wandb_path: str) -> tuple[MergeHistory, dict[str, Any], str]:
+ """Download and load WandB artifacts.
+
+ Args:
+ wandb_path: WandB run path (e.g., entity/project/run_id)
+
+ Returns:
+ Tuple of (MergeHistory, run_config_dict, run_id)
+ """
+ api: wandb.Api = wandb.Api()
+ run: Run = api.run(wandb_path)
+ logger.info(f"Loaded WandB run: {run.name} ({run.id})")
+
+ # Download merge history artifact
+ logger.info("Downloading merge history artifact...")
+ artifacts: list[Any] = [a for a in run.logged_artifacts() if a.type == "merge_history"]
+ if not artifacts:
+ raise ValueError(f"No merge_history artifacts found in run {wandb_path}")
+ artifact: Any = artifacts[0]
+ logger.info(f"Found artifact: {artifact.name}")
+
+ artifact_dir: str = artifact.download()
+ merge_history_path: Path = Path(artifact_dir) / ClusteringRunStorage._HISTORY
+ merge_history: MergeHistory = MergeHistory.read(merge_history_path)
+ logger.info(f"Loaded merge history: {merge_history}")
+
+ return merge_history, run.config, run.id
+
+
+def setup_model_and_data(
+ run_config: dict[str, Any],
+ context_length: int,
+ batch_size: int,
+ streaming: bool = False,
+) -> tuple[ComponentModel, PreTrainedTokenizer, DataLoader[Any], Config]:
+ """Set up model, tokenizer, and dataloader.
+
+ Args:
+ run_config: WandB run config dictionary
+ context_length: Context length for tokenization
+ batch_size: Batch size for data loading
+ streaming: Whether to use streaming dataset loading
+ Returns:
+ Tuple of (model, tokenizer, dataloader, spd_config)
+ """
+ device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load model
+ model_path: str = run_config["model_path"]
+ logger.info(f"Loading model from: {model_path}")
+ spd_run: SPDRunInfo = SPDRunInfo.from_path(model_path)
+ model: ComponentModel = ComponentModel.from_run_info(spd_run)
+ model.to(device)
+ model.eval()
+ config: Config = spd_run.config
+ tokenizer_name: str = config.tokenizer_name # pyright: ignore[reportAssignmentType]
+ logger.info(f"{tokenizer_name = }")
+
+ # Load tokenizer
+ tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+ logger.info(f"Loaded: {tokenizer = }")
+
+ # Attach vocab array for optimized batch decoding
+ attach_vocab_arr(tokenizer)
+ logger.info("Attached vocab array to tokenizer for optimized decoding")
+
+ # Create dataloader
+ # TODO: read this from batches_config.json
+ dataset_config: DatasetConfig = DatasetConfig(
+ name="SimpleStories/SimpleStories",
+ hf_tokenizer_path=tokenizer_name,
+ split="train",
+ n_ctx=context_length,
+ is_tokenized=False, # Text dataset
+ streaming=streaming,
+ column_name="story",
+ )
+ logger.info(f"Using {dataset_config = }")
+
+ dataloader: DataLoader[Any]
+ dataloader, _ = create_data_loader(
+ dataset_config=dataset_config,
+ batch_size=batch_size,
+ buffer_size=4,
+ ddp_rank=0,
+ ddp_world_size=1,
+ )
+ logger.info(f"Created {dataloader = }")
+
+ return model, tokenizer, dataloader, config
+
+
+def generate_model_info(
+ model: ComponentModel,
+ merge_history: MergeHistory,
+ merge: GroupMerge,
+ iteration: int,
+ model_path: str,
+ tokenizer_name: str,
+ config_dict: dict[str, Any] | None = None,
+ wandb_clustering_run: str | None = None,
+) -> dict[str, Any]:
+ """Generate model information dictionary.
+
+ Args:
+ model: The ComponentModel instance
+ merge_history: MergeHistory containing component labels
+ merge: GroupMerge for the current iteration
+ iteration: Current iteration number
+ model_path: Path to the model
+ tokenizer_name: Name of the tokenizer
+ config_dict: Optional config dictionary
+ wandb_clustering_run: Optional wandb clustering run path
+
+ Returns:
+ Dictionary containing model information
+ """
+ # Count unique modules from all components in the merge history
+ unique_modules: set[str] = set()
+ total_components: int = len(merge_history.labels)
+
+ for label in merge_history.labels:
+ comp: SubComponentKey = SubComponentKey.from_label(SubComponentLabel(label))
+ unique_modules.add(comp.module)
+
+ # Count parameters in the model
+ total_params: int = sum(p.numel() for p in model.parameters())
+ trainable_params: int = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ # Create model info dictionary
+ model_info: dict[str, Any] = {
+ "total_modules": len(unique_modules),
+ "total_components": total_components,
+ "total_clusters": len(torch.unique(merge.group_idxs)),
+ "iteration": iteration,
+ "model_path": model_path,
+ "tokenizer_name": tokenizer_name,
+ "total_parameters": total_params,
+ "trainable_parameters": trainable_params,
+ "component_size": getattr(model, "C", None),
+ "module_list": sorted(list(unique_modules)),
+ "config": config_dict,
+ "wandb_clustering_run": wandb_clustering_run,
+ }
+
+ return model_info
diff --git a/spd/clustering/dashboard/core/tokenization.py b/spd/clustering/dashboard/core/tokenization.py
new file mode 100644
index 000000000..668c3523b
--- /dev/null
+++ b/spd/clustering/dashboard/core/tokenization.py
@@ -0,0 +1,81 @@
+"""Text processing utilities for dashboard data generation."""
+
+import numpy as np
+from jaxtyping import Int
+from transformers import PreTrainedTokenizer
+
+from spd.clustering.dashboard.core.base import TextSample, TextSampleHash
+
+# NOTE: transformers tokenizer attributes are incompletely typed, so suppress pyright errors here:
+# pyright: reportAttributeAccessIssue=false, reportUnknownParameterType=false
+
+
+def attach_vocab_arr(tokenizer: PreTrainedTokenizer) -> None:
+ """Attach a numpy array of token strings to the tokenizer for fast batch decoding.
+
+ Creates a vocab_arr attribute containing all tokens as unicode strings,
+ enabling O(1) array indexing instead of repeated convert_ids_to_tokens calls.
+ """
+ vocab_size: int = tokenizer.vocab_size
+ vocab_list: list[str] = [tokenizer.convert_ids_to_tokens(i) for i in range(vocab_size)]
+ max_token_length: int = max(len(token) for token in vocab_list)
+ print(f"{max_token_length = }")
+ vocab_arr: np.ndarray = np.array(
+ vocab_list,
+ dtype=f"U{max_token_length}", # Unicode strings, not bytes
+ )
+ tokenizer.vocab_arr = vocab_arr # type: ignore[attr-defined]
+
+
+def simple_batch_decode(
+ tokenizer: PreTrainedTokenizer,
+ batch: Int[np.ndarray, "batch_size n_ctx"],
+) -> np.ndarray:
+ """Decode a batch of token IDs to their string representations.
+
+ Args:
+ tokenizer: PreTrainedTokenizer with vocab_arr attached
+ batch: Token IDs array of shape (batch_size, n_ctx)
+
+ Returns:
+ Array of shape (batch_size, n_ctx) containing unicode token strings
+ """
+ assert hasattr(tokenizer, "vocab_arr"), (
+ "Tokenizer missing vocab_arr attribute, call attach_vocab_arr first"
+ )
+ return tokenizer.vocab_arr[batch]
+
+
+def tokenize_and_create_text_samples(
+ batch: Int[np.ndarray, "batch_size n_ctx"],
+ tokenizer: PreTrainedTokenizer,
+ text_samples: dict[TextSampleHash, TextSample],
+) -> list[TextSample]:
+ """Tokenize batch and create TextSample objects.
+
+ Note: This function decodes tokens by converting IDs to token strings and joining with spaces.
+ This bypasses the tokenizer's native .decode() logic, which may handle special tokens,
+ BPE merge undoing, and special whitespace differently. For display purposes in the dashboard,
+ this simplified approach is acceptable and significantly faster.
+
+ Args:
+ batch: Input token IDs
+ tokenizer: Tokenizer for decoding
+ text_samples: Existing text samples dict (for deduplication)
+
+ Returns:
+ List of TextSample objects for the batch
+ """
+ batch_token_strings: list[list[str]] = simple_batch_decode(
+ tokenizer, batch
+ ).tolist() # [batch_size, n_ctx] of strings
+
+ # Create text samples for entire batch
+ batch_text_samples: list[TextSample] = []
+ for token_strings in batch_token_strings:
+ text: str = " ".join(token_strings)
+ text_sample: TextSample = TextSample(full_text=text, tokens=token_strings)
+ text_samples[text_sample.text_hash] = text_sample
+ batch_text_samples.append(text_sample)
+
+ return batch_text_samples
diff --git a/spd/clustering/dashboard/core/util.py b/spd/clustering/dashboard/core/util.py
new file mode 100644
index 000000000..5f38c29a4
--- /dev/null
+++ b/spd/clustering/dashboard/core/util.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+
+from spd.log import logger
+
+
+def write_html_files(output_dir: Path) -> None:
+ """Write bundled HTML files from _bundled to output directory.
+
+ Args:
+ output_dir: Directory to write HTML files to
+ """
+ import importlib.resources
+
+ # Read bundled HTML files from the _bundled package
+ bundled_package = "spd.clustering.dashboard._bundled"
+
+ index_html = importlib.resources.files(bundled_package).joinpath("index.html").read_text()
+ cluster_html = importlib.resources.files(bundled_package).joinpath("cluster.html").read_text()
+
+ # Write to output directory
+ (output_dir / "index.html").write_text(index_html)
+ (output_dir / "cluster.html").write_text(cluster_html)
+
+ logger.info(f"HTML files written to: {output_dir}")
diff --git a/spd/clustering/dashboard/core/v2.py b/spd/clustering/dashboard/core/v2.py
new file mode 100644
index 000000000..b0f18a126
--- /dev/null
+++ b/spd/clustering/dashboard/core/v2.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class ClusterData:
+ # cluster id
+ # list of subcomponent keys
+ # for future reference: "ckeys" means this_cluster_id_hash|subcomponent_key
+ # activations of subcomponents on all text : dict[ckeys, dict[text_hash, activation array]]
+
+ # properties: these are stats we compute and care about. these will be kept as-is
+
+ # max activating sequences: dict[ckeys: list of text_hash]
+ # whatever other activating sequences -- i.e. maximum mean, most above threshold, etc.
+
+ # subcomponent coactivations
+ # various activation stats (across all sequences)
+ # top activating tokens
+ pass
+
+
+# now, when we store the data we care about -- call this class DashboardData, we will take in a bunch of ClusterData and put things into shared data structures.
+# i.e. we remove activations of components/subcomponents on all text from ClusterData, and instead store a big dict[(ckeys, text_hash), activation array]
+# we store text data in a big dict[text_hash]: text data
+# we store subcomponent coactivations in a big dict[cluster_id, array] (saved as npz)
+
+# BUT -- most importantly, we want this concatenation to basically be as automated as possible. we should be able to add fields freely to cluster data, and then somehow mark them as concatenated. then we want this all to be super easy to read from javascript
diff --git a/spd/clustering/dashboard/css/model-view.css b/spd/clustering/dashboard/css/model-view.css
new file mode 100644
index 000000000..fe29170d4
--- /dev/null
+++ b/spd/clustering/dashboard/css/model-view.css
@@ -0,0 +1,54 @@
+/* Model Visualization Styles */
+
+/* Default sizes - can be overridden by setting CSS variables on parent elements */
+:root {
+ --modelview-cell-size: 16px;
+ --modelview-layer-gap: 4px;
+ --modelview-module-gap: 2px;
+ --modelview-layer-margin: 4px;
+ --modelview-module-padding: 2px;
+ --modelview-border-radius: 2px;
+}
+
+/* Container for model view in standalone page */
+.modelview-container {
+ margin: 10px;
+ padding: 10px;
+}
+
+/* Container for model view in table cell (compact) */
+.modelview-cell {
+ width: 200px;
+ height: 60px;
+ --modelview-layer-gap: 1px;
+ --modelview-module-gap: 1px;
+ --modelview-layer-margin: 1px;
+ --modelview-module-padding: 1px;
+ --modelview-border-radius: 1px;
+}
+
+/* Each layer block - renders horizontally as a row */
+.modelview-layer-block {
+ margin: var(--modelview-layer-margin) 0;
+ display: flex;
+ align-items: center;
+ gap: var(--modelview-layer-gap);
+}
+
+/* Group of modules within a layer (e.g., attention or MLP) */
+.modelview-module-group {
+ display: flex;
+ gap: var(--modelview-module-gap);
+ border: 1px solid #ddd;
+ padding: var(--modelview-module-padding);
+ background: #fafafa;
+}
+
+/* Individual module cell */
+.modelview-module-cell {
+ width: var(--modelview-cell-size);
+ height: var(--modelview-cell-size);
+ background: #f0f0f0;
+ border-radius: var(--modelview-border-radius);
+ cursor: pointer;
+}
diff --git a/spd/clustering/dashboard/css/notif.css b/spd/clustering/dashboard/css/notif.css
new file mode 100644
index 000000000..b20f202a9
--- /dev/null
+++ b/spd/clustering/dashboard/css/notif.css
@@ -0,0 +1,104 @@
+/* Notification system styling */
+.notification-container {
+ position: fixed;
+ top: 0;
+ left: 50%;
+ transform: translateX(-50%);
+ z-index: 1001;
+ pointer-events: none;
+}
+
+.notification-indicator {
+ position: absolute;
+ left: 50%;
+ transform: translateX(-50%);
+ background: rgba(0, 34, 68, 0.95);
+ color: #00ff00;
+ padding: 12px 24px;
+ border-radius: 8px;
+ font-family: 'Courier New', monospace;
+ font-size: 12px;
+ font-weight: bold;
+ display: block;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
+ border: 1px solid #00ff00;
+ min-width: 500px;
+ text-align: center;
+ transition: all 0.3s ease;
+ opacity: 1;
+ margin-bottom: 10px;
+}
+
+/* Success styling */
+.notification-indicator.success {
+ background: rgba(0, 68, 34, 0.95);
+ border-color: #00ff88;
+ color: #00ff88;
+}
+
+/* Error styling */
+.notification-indicator.error {
+ background: rgba(68, 0, 0, 0.95);
+ border-color: #ff4444;
+ color: #ff4444;
+}
+
+/* Loading spinner */
+.notification-spinner {
+ display: inline-block;
+ width: 12px;
+ height: 12px;
+ border: 2px solid #004400;
+ border-radius: 50%;
+ border-top-color: #00ff00;
+ animation: spin 1s ease-in-out infinite;
+ margin-right: 8px;
+ vertical-align: middle;
+}
+
+.notification-indicator.success .notification-spinner {
+ border-color: #004400;
+ border-top-color: #00ff88;
+}
+
+.notification-indicator.error .notification-spinner {
+ display: none;
+ /* No spinner for errors */
+}
+
+/* Progress bar components */
+.notification-progress-container {
+ width: 100%;
+ height: 6px;
+ background: rgba(255, 255, 255, 0.1);
+ border-radius: 3px;
+ margin-top: 8px;
+ overflow: hidden;
+}
+
+.notification-progress-bar {
+ height: 100%;
+ background: #00ff00;
+ border-radius: 3px;
+ transition: width 0.3s ease;
+ width: 0%;
+}
+
+.notification-indicator.success .notification-progress-bar {
+ background: #00ff88;
+}
+
+.notification-indicator.error .notification-progress-bar {
+ background: #ff4444;
+}
+
+@keyframes spin {
+ to {
+ transform: rotate(360deg);
+ }
+}
+
+/* Hide spinner when not in spinner mode */
+.notification-indicator:not(.spinner) .notification-spinner {
+ display: none;
+}
\ No newline at end of file
diff --git a/spd/clustering/dashboard/css/styles.css b/spd/clustering/dashboard/css/styles.css
new file mode 100644
index 000000000..0f6606c86
--- /dev/null
+++ b/spd/clustering/dashboard/css/styles.css
@@ -0,0 +1,104 @@
+body {
+ font-family: monospace;
+ margin: 20px;
+}
+
+h1, h2, h3 {
+ font-weight: normal;
+}
+
+table {
+ border-collapse: collapse;
+ width: 100%;
+ margin: 20px 0;
+}
+
+th, td {
+ border: 1px solid #ccc;
+ padding: 8px;
+ text-align: left;
+}
+
+th {
+ background: #f0f0f0;
+ cursor: pointer;
+ user-select: none;
+}
+
+th:hover {
+ background: #e0e0e0;
+}
+
+a {
+ color: blue;
+ text-decoration: underline;
+}
+
+#loading {
+ margin: 20px;
+ color: #666;
+}
+
+#components ul {
+ list-style: none;
+ padding: 0;
+}
+
+#components li {
+ padding: 4px 0;
+ font-family: monospace;
+}
+
+#samplesTable {
+ width: auto;
+ max-width: 100%;
+}
+
+#samplesTable td:last-child {
+ min-width: 500px;
+}
+
+/* Token activation statistics */
+.stats-grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+ gap: 15px;
+ margin: 20px 0;
+}
+
+.stat-card {
+ border: 1px solid #ccc;
+ padding: 15px;
+ background: #f9f9f9;
+}
+
+.stat-label {
+ font-size: 12px;
+ color: #666;
+ margin-bottom: 5px;
+}
+
+.stat-value {
+ font-size: 24px;
+ font-weight: bold;
+ color: #333;
+}
+
+.stat-help {
+ font-size: 11px;
+ color: #999;
+ margin-top: 5px;
+}
+
+.token-display {
+ background: #f5f5f5;
+ padding: 2px 6px;
+ border-radius: 3px;
+ font-family: monospace;
+ font-size: 13px;
+ border: 1px solid #ddd;
+}
+
+#tokenActivations {
+ margin: 0;
+}
\ No newline at end of file
diff --git a/spd/clustering/dashboard/css/token-display.css b/spd/clustering/dashboard/css/token-display.css
new file mode 100644
index 000000000..e84f6d994
--- /dev/null
+++ b/spd/clustering/dashboard/css/token-display.css
@@ -0,0 +1,24 @@
+/* Token visualization styles */
+.token {
+ display: inline-block;
+ padding: 2px 4px;
+ margin: 0 1px;
+ border-radius: 2px;
+ cursor: help;
+ position: relative;
+ background: yellow;
+}
+
+.token::before {
+ content: attr(data-tip);
+ visibility: hidden;
+ position: absolute;
+ bottom: 100%;
+ background: #000;
+ color: #fff;
+ padding: 4px;
+}
+
+.token:hover::before {
+ visibility: visible;
+}
\ No newline at end of file
diff --git a/spd/clustering/dashboard/dashboard_config.yaml b/spd/clustering/dashboard/dashboard_config.yaml
new file mode 100644
index 000000000..49edd346f
--- /dev/null
+++ b/spd/clustering/dashboard/dashboard_config.yaml
@@ -0,0 +1,28 @@
+# Dashboard Configuration
+# This config is used by the clustering dashboard to compute max-activating text samples
+
+# WandB clustering run path (format: entity/project/run_id or wandb:entity/project/runs/run_id)
+# wandb_run: "goodfire/spd-cluster/runs/e5gbybvn"
+wandb_run: "goodfire/spd-cluster/runs/c-c3623a67"
+
+# Merge iteration to analyze (negative indexes from end, -1 = latest)
+# iteration: 4600
+iteration: 4795
+
+# Number of top-activating samples to collect per cluster
+n_samples: 16
+
+# Number of data batches to process
+n_batches: 1
+
+# Batch size for data loading
+batch_size: 64
+
+# Context length for tokenization
+context_length: 64
+
+# Write bundled HTML files to output directory
+write_html: false
+
+# Base output directory (optional, defaults to REPO_ROOT/spd/clustering/dashboard/data/)
+# output_dir: "path/to/custom/output"
diff --git a/spd/clustering/dashboard/frontend_config.json b/spd/clustering/dashboard/frontend_config.json
new file mode 100644
index 000000000..8c14275d0
--- /dev/null
+++ b/spd/clustering/dashboard/frontend_config.json
@@ -0,0 +1,19 @@
+{
+ "data": {
+ "dataDir": "data/c-c3623a67-i4795/data"
+ },
+ "indexPage": {
+ "pageSize": 25,
+ "pageSizeOptions": [10, 25, 50, 100],
+ "showFilters": true
+ },
+ "clusterPage": {
+ "pageSize": 25,
+ "maxSamplesPerCluster": 32,
+ "showFilters": true
+ },
+ "visualization": {
+ "colormap": "blues",
+ "histogramBins": 10
+ }
+}
diff --git a/spd/clustering/dashboard/html-tests/test-token-display.html b/spd/clustering/dashboard/html-tests/test-token-display.html
new file mode 100644
index 000000000..ecbbe56a3
--- /dev/null
+++ b/spd/clustering/dashboard/html-tests/test-token-display.html
@@ -0,0 +1,51 @@
+
+
+
+ Token Display Test
+
+
+
+
+ Token Display Test
+
+ Example 1: BERT-style (default red)
+
+
+ Example 2: BERT-style with blue color scheme
+
+
+ Example 3: GPT2-style tokenization
+
+
+ Example 4: GPT2-style with custom color [255, 128, 0]
+
+
+
+
+
+
diff --git a/spd/clustering/dashboard/index.html b/spd/clustering/dashboard/index.html
new file mode 100644
index 000000000..604e1fbb4
--- /dev/null
+++ b/spd/clustering/dashboard/index.html
@@ -0,0 +1,150 @@
+
+
+
+ Cluster Selection
+
+
+
+
+
+
+
+ Cluster Selection Dashboard
+
+
+
Model Information
+
+
+ Total Modules:
+
+
+ Total Components:
+
+
+ Total Clusters:
+
+
+ Model Parameters:
+
+
+ Iteration:
+
+
+ Component Size:
+
+
+ Source SPD Run:
+
+
+ Clustering Run:
+
+
+ Pretrained Model:
+
+
+
+ Configuration Details
+
+
+ Seed:
+
+
+ Steps:
+
+
+ Learning Rate:
+
+
+ Batch Size:
+
+
+ Sigmoid Type:
+
+
+ Sampling:
+
+
+ LR Schedule:
+
+
+ Output Loss:
+
+
+
+
+
+
+ Loading data...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/spd/clustering/dashboard/js/cluster-detail.js b/spd/clustering/dashboard/js/cluster-detail.js
new file mode 100644
index 000000000..edd41d11e
--- /dev/null
+++ b/spd/clustering/dashboard/js/cluster-detail.js
@@ -0,0 +1,671 @@
// ---- Page-level state for the cluster-detail view ----
let clusterData = null;          // record for the currently displayed cluster
let allClusters = null;          // map: cluster hash -> cluster record (from ZANJ payload)
let textSamples = {};            // map: text hash -> text/token sample
let currentClusterHash = null;   // cluster id taken from the ?id= query parameter
let modelInfo = {};              // model/run metadata (from ZANJ payload)
// TODO: Re-enable explanations feature
// let explanations = {};

// DEPRECATED: activationsArray and activationsMap no longer needed - data is self-contained in cluster.samples

// Component-level data
let componentActivations = {}; // Map component labels to their activation data
let enabledComponents = new Set(); // Track which components are enabled
let combinationStrategy = 'max'; // How to combine component activations: 'max', 'sum', 'mean'
+
/**
 * Entry point for the cluster-detail page: read the cluster id from the
 * ?id= query parameter, then fetch and render all data for that cluster.
 * Shows an inline message (or a NOTIF error) when the id is missing.
 */
async function init() {
    const params = new URLSearchParams(window.location.search);
    currentClusterHash = params.get('id');

    if (!currentClusterHash) {
        const loadingEl = document.getElementById('loading');
        if (!loadingEl) {
            const msg = 'Fatal error: loading element not found in HTML';
            NOTIF.error(msg, null, null);
            console.error(msg);
            return;
        }
        loadingEl.textContent = 'No cluster ID specified';
        return;
    }

    await loadData();
}
+
/**
 * Load all dashboard data via ZANJ, locate the current cluster, and render it.
 * Errors are surfaced through NOTIF and the console; nothing is thrown to the
 * caller.
 *
 * Fix: displayCluster() is async and was previously invoked fire-and-forget,
 * so any error raised while rendering (including the deliberate fail-fast
 * throws in displaySamples/setupComponentsTable) escaped this try/catch as an
 * unhandled rejection. It is now awaited so the catch block reports it.
 */
async function loadData() {
    try {
        // Load all data via ZANJ
        const loader = new ZanjLoader(CONFIG.data.dataDir);
        const data = await loader.read();

        allClusters = data.clusters;
        textSamples = data.text_samples;
        modelInfo = data.model_info;

        // TODO: Re-enable explanations feature
        // Load explanations separately (not part of ZANJ)
        // const explanationsPath = CONFIG.getDataPath('explanations');
        // explanations = await loadJSONL(explanationsPath, 'cluster_id').catch(() => ({}));

        // Single lookup; reused by both the error and the success paths below.
        const loading = document.getElementById('loading');

        if (!allClusters[currentClusterHash]) {
            const msg = 'Cluster not found';
            NOTIF.error(msg, null, null);
            if (loading) {
                loading.textContent = msg;
            } else {
                console.error('loading element not found, cannot display error message');
            }
            return;
        }

        clusterData = allClusters[currentClusterHash];

        // Await so rendering errors are caught here (was fire-and-forget).
        await displayCluster();

        if (!loading) {
            const msg = 'Fatal error: loading element not found in HTML';
            NOTIF.error(msg, null, null);
            console.error(msg);
            return;
        }
        loading.style.display = 'none';
    } catch (error) {
        console.error('Load error:', error);
        console.error('Stack:', error.stack);
        NOTIF.error('Failed to load cluster data: ' + error.message, error, null);
    }
}
+
/**
 * Render every section of the cluster-detail page from the module-level
 * clusterData/allClusters/modelInfo state (populated by loadData()).
 * Bails out with a NOTIF error if a required DOM element is missing.
 */
async function displayCluster() {
    // Update title
    const clusterTitle = document.getElementById('clusterTitle');
    if (!clusterTitle) {
        const msg = 'Fatal error: clusterTitle element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    clusterTitle.textContent = `Cluster ${currentClusterHash}`;

    // Display component count
    const componentCount = document.getElementById('componentCount');
    if (!componentCount) {
        const msg = 'Fatal error: componentCount element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    // Await lazy-loaded components (may be a ZANJ proxy/promise)
    const components = await clusterData.components;
    componentCount.textContent = components.length;

    // TODO: Re-enable explanations feature
    // Display explanation and setup copy handler
    // displayExplanation();
    // setupCopyHandler();

    // Initialize component data (component activations + enabled set)
    await initializeComponentData();

    // Display model visualization
    await displayModelVisualization();

    // Setup components table
    await setupComponentsTable();

    // Setup hover highlighting between model view and components table
    setupModelViewHighlighting();

    // Display histogram plots
    displayHistograms();

    // Display token activation stats if available
    if (clusterData.stats && clusterData.stats.token_activations) {
        displayTokenActivations();
    }

    // Display samples
    await displaySamples();
}
+
+// TODO: Re-enable explanations feature
+// function displayExplanation() {
+// const explanationSpan = document.getElementById('clusterExplanation');
+// if (!explanationSpan) return;
+//
+// const explanationData = explanations[currentClusterHash];
+// if (explanationData && explanationData.explanation) {
+// explanationSpan.textContent = explanationData.explanation;
+// explanationSpan.style.fontStyle = 'normal';
+// explanationSpan.style.color = '#000';
+// } else {
+// explanationSpan.textContent = 'No explanation';
+// explanationSpan.style.fontStyle = 'italic';
+// explanationSpan.style.color = '#666';
+// }
+// }
+//
+// function setupCopyHandler() {
+// const copyBtn = document.getElementById('copyTemplateBtn');
+// if (!copyBtn) return;
+//
+// copyBtn.addEventListener('click', async () => {
+// const template = JSON.stringify({
+// cluster_id: currentClusterHash,
+// explanation: ""
+// }) + '\n';
+//
+// try {
+// await navigator.clipboard.writeText(template);
+// NOTIF.success('Template copied to clipboard!');
+// } catch (err) {
+// // Fallback for older browsers
+// const textArea = document.createElement('textarea');
+// textArea.value = template;
+// textArea.style.position = 'fixed';
+// textArea.style.left = '-999999px';
+// document.body.appendChild(textArea);
+// textArea.select();
+// try {
+// document.execCommand('copy');
+// NOTIF.success('Template copied to clipboard!');
+// } catch (e) {
+// NOTIF.error('Failed to copy template', e, null);
+// }
+// document.body.removeChild(textArea);
+// }
+// });
+// }
+
/**
 * Seed component-level state for the current cluster: copy any per-component
 * activation data from the payload and mark every component as enabled.
 */
async function initializeComponentData() {
    // Per-component activation data is optional in the payload.
    if (clusterData.component_activations) {
        componentActivations = clusterData.component_activations;
    }

    // Reset, then enable every component by default.
    enabledComponents.clear();
    const components = await clusterData.components;  // lazy-loaded (ZANJ proxy)
    for (const component of components) {
        enabledComponents.add(component.label);
    }
}
+
/**
 * Render the per-module model view for the current cluster into #modelView.
 */
async function displayModelVisualization() {
    const container = document.getElementById('modelView');
    if (container === null) {
        const msg = 'Fatal error: modelView element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    await renderModelView(container, currentClusterHash, allClusters, modelInfo, CONFIG.visualization.colormap, CONFIG.visualization.modelViewCellSize);
}
+
/**
 * Render one sparkline histogram per binned stat found in clusterData.stats
 * (any value duck-typed as BinnedData: has bin_counts + bin_edges) into
 * #histogramPlots. No-op when the cluster has no stats.
 */
function displayHistograms() {
    const stats = clusterData.stats;
    if (!stats) return;

    const histogramPlots = document.getElementById('histogramPlots');
    if (!histogramPlots) {
        const msg = 'Fatal error: histogramPlots element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    histogramPlots.innerHTML = '';

    // Color mapping for different histogram types (unknown keys fall back to grey)
    const statColors = {
        'all_activations': '#4169E1',
        'max_activation-max-16': '#DC143C',
        'max_activation-max-32': '#DC143C',
        'mean_activation-max-16': '#228B22',
        'median_activation-max-16': '#FF8C00',
        'min_activation-max-16': '#9370DB',
        'max_activation_position': '#FF6347'
    };

    // Discover all histogram stats
    const histogramStats = [];
    for (const [key, value] of Object.entries(stats)) {
        if (value && typeof value === 'object' && 'bin_counts' in value && 'bin_edges' in value) {
            histogramStats.push(key);
        }
    }

    // Create a plot for each histogram stat
    histogramStats.forEach(statKey => {
        const histData = stats[statKey];
        const color = statColors[statKey] || '#808080';
        // Human-readable label: e.g. "max_activation-max-16" -> "Max Activation Max 16"
        const label = statKey.replace(/-/g, ' ').replace(/_/g, ' ')
            .split(' ')
            .map(word => word.charAt(0).toUpperCase() + word.slice(1))
            .join(' ');

        // Create container for this plot
        const plotContainer = document.createElement('div');
        plotContainer.style.display = 'flex';
        plotContainer.style.flexDirection = 'column';
        plotContainer.style.alignItems = 'center';
        plotContainer.style.minWidth = '250px';

        // Add label
        const plotLabel = document.createElement('div');
        plotLabel.textContent = label;
        plotLabel.style.fontSize = '12px';
        plotLabel.style.fontWeight = 'bold';
        plotLabel.style.marginBottom = '5px';
        plotLabel.style.textAlign = 'center';
        plotContainer.appendChild(plotLabel);

        // Create sparkline
        const sparklineContainer = document.createElement('div');
        sparklineContainer.className = 'sparkline-cell';

        // Calculate bin centers for x-axis
        const binCenters = calculateBinCenters(histData.bin_edges);

        const min = histData.bin_edges[0];
        const max = histData.bin_edges[histData.bin_edges.length - 1];

        // Set x-axis limits to [0, 1] if data is in that range
        const xlims = (min >= 0 && max <= 1) ? [0, 1] : null;

        // Defaults (|| 200 etc.) guard against missing CONFIG.visualization keys
        const svg = sparkbars(binCenters, histData.bin_counts, {
            width: CONFIG.visualization.sparklineWidth || 200,
            height: CONFIG.visualization.sparklineHeight || 60,
            color: color,
            shading: true,
            lineWidth: 0,
            markers: '',
            margin: 2,
            xlims: xlims,
            ylims: [0, null],
            logScale: true,
            xAxis: {line: true, ticks: true, label_margin: 10},
            yAxis: {line: true, ticks: true, label_margin: CONFIG.visualization.sparklineYAxisMargin || 35}
        });

        sparklineContainer.innerHTML = svg;

        // Add tooltip with statistics
        const mean = calculateHistogramMean(histData);
        const median = calculateHistogramMedian(histData);
        const totalCount = histData.bin_counts.reduce((a, b) => a + b, 0);
        sparklineContainer.title = `${label} (n=${totalCount})\n\nMin: ${min.toFixed(4)}\nMax: ${max.toFixed(4)}\nMean: ${mean.toFixed(4)}\nMedian: ${median.toFixed(4)}`;

        plotContainer.appendChild(sparklineContainer);
        histogramPlots.appendChild(plotContainer);
    });
}
+
/**
 * Reveal the token-activation section (#tokenActivations) and populate the
 * top-tokens DataTable. Caller guarantees clusterData.stats.token_activations
 * exists before invoking.
 */
function displayTokenActivations() {
    const tokenStats = clusterData.stats.token_activations;

    // Show the section (assumed hidden by default in the HTML)
    const tokenActivations = document.getElementById('tokenActivations');
    if (!tokenActivations) {
        const msg = 'Fatal error: tokenActivations element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    tokenActivations.style.display = 'block';

    // Setup top tokens table
    if (tokenStats.top_tokens && tokenStats.top_tokens.length > 0) {
        const tableData = tokenStats.top_tokens.map((item, idx) => ({
            rank: idx + 1,
            token: item.token,
            count: item.count,
            percentage: ((item.count / tokenStats.total_activations) * 100)
        }));

        // Row 0 is treated as the maximum (assumes top_tokens is sorted
        // descending by count — TODO confirm against the data producer);
        // used to normalize the cell background colors below.
        const maxPercentage = tableData.length > 0 ? tableData[0].percentage : 0;

        const tableConfig = {
            data: tableData,
            columns: [
                {
                    key: 'rank',
                    label: '#',
                    type: 'number',
                    width: '40px',
                    align: 'right'
                },
                {
                    key: 'token',
                    label: 'Token',
                    type: 'string',
                    width: '120px',
                    renderer: (value) => {
                        // Show token in a monospace box with visual formatting
                        // (middle dot for spaces, return arrow for newlines)
                        // NOTE(review): the returned template looks like it lost
                        // wrapper markup in extraction — verify against original.
                        const tokenDisplay = value.replace(/ /g, '·').replace(/\n/g, '↵');
                        return `${tokenDisplay}`;
                    }
                },
                {
                    key: 'percentage',
                    label: '%',
                    type: 'number',
                    width: '70px',
                    align: 'right',
                    renderer: (value) => {
                        const percentageValue = value;
                        const percentage = percentageValue.toFixed(1);

                        // Color based on percentage (normalized by max percentage):
                        // white at 0, saturated red at the column max
                        const normalizedPct = maxPercentage > 0 ? percentageValue / maxPercentage : 0;
                        const intensity = Math.floor((1 - normalizedPct) * 255);
                        const bgColor = `rgb(255, ${intensity}, ${intensity})`;

                        const span = document.createElement('span');
                        span.textContent = `${percentage}%`;
                        span.style.backgroundColor = bgColor;
                        span.style.padding = '2px 4px';
                        span.style.borderRadius = '2px';

                        return span;
                    },
                    // NOTE(review): infoFunction is nested inside the
                    // 'percentage' column config — confirm DataTable reads it
                    // per-column rather than at table level.
                    infoFunction: () => {
                        return `Unique: ${tokenStats.total_unique_tokens.toLocaleString()} | Total: ${tokenStats.total_activations.toLocaleString()} | Entropy: ${tokenStats.entropy.toFixed(2)} | Conc: ${(tokenStats.concentration_ratio * 100).toFixed(1)}%`;
                    }
                }
            ],
            pageSize: 10,
            showFilters: false,
            showInfo: true
        };

        new DataTable('#topTokensTable', tableConfig);
    }
}
+
/**
 * Build the components DataTable (checkbox | module | index) for the current
 * cluster and wire per-row enable/disable toggles.
 *
 * Fix: removed six leftover debug console.log statements (including an
 * unconditional poke at clusterData.components.then before awaiting) that
 * ran on every page load. The fail-fast validation that replaced them is kept.
 *
 * @throws {Error} if clusterData.components is not an array or contains
 *                 undefined/null entries.
 */
async function setupComponentsTable() {
    // Await lazy-loaded components (may be a ZANJ proxy/promise)
    const components = await clusterData.components;

    // Fail fast: verify components is a valid array
    if (!Array.isArray(components)) {
        console.error("components:", components);
        console.error("components keys:", Object.keys(components));
        console.error("clusterData:", clusterData);
        throw new Error(`clusterData.components is not an array: ${typeof components}. Keys: ${Object.keys(components).join(", ")}`);
    }

    // Fail fast: check for undefined/null entries
    const badIndex = components.findIndex(comp => comp === undefined || comp === null);
    if (badIndex !== -1) {
        throw new Error(`clusterData.components contains undefined/null at index ${badIndex}`);
    }

    const tableData = components.map(comp => ({
        label: comp.label,
        module: comp.module,
        index: comp.index,
        enabled: enabledComponents.has(comp.label)
    }));

    const tableConfig = {
        data: tableData,
        columns: [
            {
                key: 'enabled',
                label: '✓',
                type: 'boolean',
                width: '40px',
                align: 'center',
                renderer: (value, row) => {
                    // Checkbox toggles whether this component contributes
                    // to the displayed activations.
                    const checkbox = document.createElement('input');
                    checkbox.type = 'checkbox';
                    checkbox.checked = value;
                    checkbox.style.cursor = 'pointer';
                    checkbox.addEventListener('change', (e) => {
                        onComponentToggle(row.label, e.target.checked);
                    });
                    return checkbox;
                },
                filterable: false
            },
            {
                key: 'module',
                label: 'Module',
                type: 'string',
                width: '250px'
            },
            {
                key: 'index',
                label: 'Index',
                type: 'number',
                width: '80px',
                align: 'right'
            }
        ],
        pageSize: CONFIG.clusterPage.pageSize,
        showFilters: false
    };

    new DataTable('#componentsTable', tableConfig);
}
+
/**
 * Handle a component checkbox toggle: add or remove the component label from
 * the enabled set, then refresh the displayed activations.
 */
function onComponentToggle(componentLabel, isEnabled) {
    if (!isEnabled) {
        enabledComponents.delete(componentLabel);
    } else {
        enabledComponents.add(componentLabel);
    }

    // Recompute and redisplay activations
    recomputeDisplayedActivations();
}
+
/**
 * Refresh the samples view after a component toggle.
 *
 * NOTE(review): all three branches currently end in displaySamples() with
 * cluster-level activations — the per-component recombination path (see
 * combineComponentActivations) is not wired in yet. The branch structure is
 * kept as scaffolding for that feature; confirm before "simplifying" it away.
 */
async function recomputeDisplayedActivations() {
    // If no components are enabled or component activations not available, use cluster-level
    if (enabledComponents.size === 0 || !componentActivations || Object.keys(componentActivations).length === 0) {
        // Just redisplay with cluster-level activations (default)
        await displaySamples();
        return;
    }

    // Await lazy-loaded components
    const components = await clusterData.components;

    // If all components are enabled, use cluster-level activations (faster)
    if (enabledComponents.size === components.length) {
        await displaySamples();
        return;
    }

    // Recompute activations based on enabled components
    await displaySamples();
}
+
/**
 * Combine several per-token activation arrays into a single array.
 *
 * @param {number[][]} componentActsList - one activation array [n_ctx] per component
 * @param {string} strategy - 'max' | 'sum' | 'mean'
 * @returns {number[]|null} combined activations [n_ctx]; null for empty input;
 *                          the original array (not a copy) for a single entry
 * @throws {Error} on an unrecognized strategy. (Previously an unknown strategy
 *                 silently returned an all-zeros array — a masked bug.)
 */
function combineComponentActivations(componentActsList, strategy) {
    if (componentActsList.length === 0) {
        return null;
    }
    if (componentActsList.length === 1) {
        return componentActsList[0];
    }

    // Fail fast instead of silently producing zeros for a typo'd strategy.
    if (strategy !== 'max' && strategy !== 'sum' && strategy !== 'mean') {
        throw new Error(`Unknown combination strategy: ${strategy}`);
    }

    const n_ctx = componentActsList[0].length;
    const combined = new Array(n_ctx);

    for (let i = 0; i < n_ctx; i++) {
        if (strategy === 'max') {
            let maxVal = componentActsList[0][i];
            for (let j = 1; j < componentActsList.length; j++) {
                maxVal = Math.max(maxVal, componentActsList[j][i]);
            }
            combined[i] = maxVal;
        } else {
            // 'sum' and 'mean' share the accumulation; mean divides at the end.
            let sum = 0;
            for (let j = 0; j < componentActsList.length; j++) {
                sum += componentActsList[j][i];
            }
            combined[i] = strategy === 'mean' ? sum / componentActsList.length : sum;
        }
    }

    return combined;
}
+
/**
 * Cross-highlight: hovering a module cell in the model view highlights the
 * matching module rows in the components table; leaving the cell clears it.
 */
function setupModelViewHighlighting() {
    const componentsTable = document.querySelector('#componentsTable');
    if (!componentsTable) return;

    const clearHighlights = () => {
        componentsTable.querySelectorAll('.tablejs-data-row').forEach(row => {
            row.style.backgroundColor = '';
        });
    };

    const highlightModule = (moduleName) => {
        componentsTable.querySelectorAll('.tablejs-data-row').forEach(row => {
            const cells = row.querySelectorAll('td');
            // Second column holds the module name (first is the checkbox)
            if (cells.length > 1 && cells[1] && cells[1].textContent === moduleName) {
                row.style.backgroundColor = '#fff3cd'; // Light yellow highlight
            }
        });
    };

    document.querySelectorAll('.modelview-module-cell').forEach(cell => {
        cell.addEventListener('mouseenter', (event) => {
            const moduleName = event.target.dataset.module;
            if (moduleName) {
                highlightModule(moduleName);
            }
        });
        cell.addEventListener('mouseleave', clearHighlights);
    });
}
+
/**
 * Render up to CONFIG.clusterPage.maxSamplesPerCluster activation samples for
 * the current cluster into #samplesTableBody.
 *
 * Fix: removed the per-sample debug console.log block that dumped every
 * sample's structure on each render pass (log spam in a hot loop).
 *
 * @throws {Error} when a sample yields no activation data (fail-fast; the
 *                 caller's try/catch surfaces it via NOTIF).
 */
async function displaySamples() {
    const tbody = document.getElementById('samplesTableBody');
    if (!tbody) {
        const msg = 'Fatal error: samplesTableBody element not found in HTML';
        NOTIF.error(msg, null, null);
        console.error(msg);
        return;
    }
    tbody.innerHTML = '';

    // Use self-contained samples from cluster data
    const samples = clusterData.samples || [];
    if (samples.length === 0) {
        tbody.innerHTML = 'No samples available ';
        return;
    }

    const samplesToShow = Math.min(CONFIG.clusterPage.maxSamplesPerCluster, samples.length);

    for (let i = 0; i < samplesToShow; i++) {
        const sample = samples[i];
        const textSample = textSamples[sample.text_hash];

        if (!textSample) {
            console.warn(`Text sample not found for hash: ${sample.text_hash}`);
            continue;
        }

        // Activations might be a ZANJ Proxy (lazy-loaded .npy reference)
        // Need to await it to get the actual NDArray object
        const activations = await sample.activations;

        // The NDArray object has the actual data in the .data property (Float32Array)
        // Convert to regular array for visualization
        const activationsData = activations.data
            ? Array.from(activations.data)
            : (Array.isArray(activations) ? activations : Array.from(activations));

        // Fail immediately if activations are missing or empty
        if (!activationsData || activationsData.length === 0) {
            console.error('sample:', sample);
            console.error('activations:', activations);
            console.error('activationsData:', activationsData);
            throw new Error(
                `No activations found for sample ${i} in cluster ${currentClusterHash}.\n` +
                `Sample structure: ${JSON.stringify(Object.keys(sample))}\n` +
                `sample.activations type: ${typeof sample.activations}\n` +
                `activations after await type: ${typeof activations}\n` +
                `activationsData length: ${activationsData?.length}\n` +
                `Expected: Array or ArrayLike with length > 0`
            );
        }

        const tokenViz = createTokenVisualization(
            sample.tokens,
            activationsData
        );

        // NOTE(review): the row template appears to have lost its cell tags in
        // extraction — verify the markup against the original file.
        const tr = document.createElement('tr');
        tr.innerHTML = `
            ${i + 1}
            
        `;

        // Add token visualization to last cell
        tr.lastElementChild.appendChild(tokenViz);

        tbody.appendChild(tr);
    }

    if (samples.length > CONFIG.clusterPage.maxSamplesPerCluster) {
        const tr = document.createElement('tr');
        tr.innerHTML = `
            ... and ${samples.length - CONFIG.clusterPage.maxSamplesPerCluster} more samples
        `;
        tbody.appendChild(tr);
    }
}
+
+
+// Initialize config and load data on page load
+document.addEventListener('DOMContentLoaded', async () => {
+ await initConfig();
+ init();
+});
\ No newline at end of file
diff --git a/spd/clustering/dashboard/js/cluster-selection.js b/spd/clustering/dashboard/js/cluster-selection.js
new file mode 100644
index 000000000..f90a052df
--- /dev/null
+++ b/spd/clustering/dashboard/js/cluster-selection.js
@@ -0,0 +1,838 @@
// ---- Page-level state for the cluster-selection (index) view ----
let clusterData = {};   // map: cluster hash -> cluster record
let modelInfo = {};     // model/run metadata from the data payload
let dataTable = null;   // DataTable instance for the cluster list
// TODO: Re-enable explanations feature
// let explanations = {};
+
// Alpine.js data component for model info
const modelInfoData = {
    data: {},        // model-info payload once loaded
    hasData: false,  // set true when `data` has been populated

    // Human-readable parameter count: 1.2M / 3.4K / raw number; '-' when falsy.
    formatParameters(totalParams) {
        if (!totalParams) return '-';
        if (totalParams >= 1000000) return (totalParams / 1000000).toFixed(1) + 'M';
        if (totalParams >= 1000) return (totalParams / 1000).toFixed(1) + 'K';
        return totalParams.toString();
    },

    // Turn a "wandb:entity/project/run" path into display text for a wandb.ai
    // link. NOTE(review): the returned template appears to have lost its
    // anchor markup in extraction — `url` is computed but unused below;
    // verify against the original source file.
    formatWandBLink(path) {
        if (!path) return '-';

        // Remove "wandb:" prefix if present
        const cleanPath = path.replace(/^wandb:/, '');

        // Convert to WandB URL
        const url = `https://wandb.ai/${cleanPath}`;

        // Show shortened path in link text
        const displayText = cleanPath.length > 60
            ? cleanPath.substring(0, 57) + '...'
            : cleanPath;

        return `${displayText} `;
    }
};
+
// Custom column renderers for the cluster-selection DataTable.
// Each renderer receives (value, row, col) and returns a DOM node or string.
const columnRenderers = {
    // Small per-module model view for this cluster's row.
    // NOTE(review): renderModelView is awaited elsewhere but its promise is
    // ignored here, so the cell fills in asynchronously — confirm intended.
    modelView: function(value, row, col) {
        const container = document.createElement('div');
        container.className = 'modelview-cell';

        renderModelView(container, row.clusterHash, clusterData, modelInfo, CONFIG.visualization.colormap, CONFIG.visualization.modelViewCellSizeTable);

        return container;
    },

    // Compact summary of the row's module list: last two dotted path segments
    // for 1-3 modules, otherwise just a count. Full list shown as tooltip.
    modulesSummary: function(value, row, col) {
        const modules = row.modules;
        const container = document.createElement('div');
        container.className = 'module-summary';

        if (modules.length === 1) {
            const parts = modules[0].split('.');
            container.textContent = parts.length > 2 ? parts.slice(-2).join('.') : modules[0];
        } else if (modules.length <= 3) {
            container.textContent = modules.map(m => {
                const parts = m.split('.');
                return parts.length > 2 ? parts.slice(-2).join('.') : m;
            }).join(', ');
        } else {
            container.textContent = `${modules.length} modules`;
        }

        container.title = modules.join('\n');
        return container;
    },

    // Sparkline of the all-activations histogram (row.stats.all_activations).
    activationHistogram: function(value, row, col) {
        const histData = row.stats.all_activations;
        if (!histData) {
            return 'No data ';
        }

        const container = document.createElement('div');
        container.className = 'sparkline-cell';

        // Calculate bin centers for x-axis
        const binCenters = calculateBinCenters(histData.bin_edges);

        const min = row.stats.min_activation;
        const max = row.stats.max_activation;

        // Set x-axis limits to [0, 1] if data is in that range
        const xlims = (min >= 0 && max <= 1) ? [0, 1] : null;

        // Pass bin centers as x-values and counts as y-values
        const svg = sparkbars(binCenters, histData.bin_counts, {
            width: CONFIG.visualization.sparklineWidth,
            height: CONFIG.visualization.sparklineHeight,
            color: '#4169E1',
            shading: true,
            lineWidth: 0,
            markers: '',
            margin: 2,
            xlims: xlims,
            ylims: [0, null],
            logScale: true,
            xAxis: {line: true, ticks: true, label_margin: 10},
            yAxis: {line: true, ticks: true, label_margin: CONFIG.visualization.sparklineYAxisMargin}
        });

        container.innerHTML = svg;

        const mean = row.stats.mean_activation;
        const median = calculateHistogramMedian(histData);
        const n = row.stats.n_tokens;

        container.title = `All Activations Histogram (n=${n})\n\nMin: ${min.toFixed(4)}\nMax: ${max.toFixed(4)}\nMean: ${mean.toFixed(4)}\nMedian: ${median.toFixed(4)}`;

        return container;
    },

    // Sparkline of the per-sample max-activation distribution
    // (row.stats['max_activation-max-16']).
    maxActivationDistribution: function(value, row, col) {
        const histData = row.stats['max_activation-max-16'];
        if (!histData) {
            return 'No data ';
        }

        const container = document.createElement('div');
        container.className = 'sparkline-cell';

        // Calculate bin centers for x-axis
        const binCenters = calculateBinCenters(histData.bin_edges);

        const min = histData.bin_edges[0];
        const max = histData.bin_edges[histData.bin_edges.length - 1];

        // Set x-axis limits to [0, 1] if data is in that range
        const xlims = (min >= 0 && max <= 1) ? [0, 1] : null;

        // Pass bin centers as x-values and counts as y-values
        const svg = sparkbars(binCenters, histData.bin_counts, {
            width: CONFIG.visualization.sparklineWidth,
            height: CONFIG.visualization.sparklineHeight,
            color: '#DC143C',
            shading: true,
            lineWidth: 0,
            markers: '',
            margin: 2,
            xlims: xlims,
            ylims: [0, null],
            logScale: true,
            xAxis: {line: true, ticks: true, label_margin: 10},
            yAxis: {line: true, ticks: true, label_margin: CONFIG.visualization.sparklineYAxisMargin}
        });

        container.innerHTML = svg;

        const n = row.stats.n_samples;
        const mean = calculateHistogramMean(histData);
        const median = calculateHistogramMedian(histData);

        container.title = `Max Activation Distribution (n=${n} samples)\n\nMin: ${min.toFixed(4)}\nMax: ${max.toFixed(4)}\nMean: ${mean.toFixed(4)}\nMedian: ${median.toFixed(4)}`;

        return container;
    },

    // Link to the cluster-detail page for this row.
    // NOTE(review): template appears to have lost its anchor markup in
    // extraction — verify against the original source.
    clusterLink: function(value, row, col) {
        return `View → `;
    },

    // TODO: Re-enable explanations feature
    // explanation: function(value, row, col) {
    //     if (!value) {
    //         return '— ';
    //     }
    //     // Truncate long explanations
    //     const maxLength = 60;
    //     if (value.length > maxLength) {
    //         const truncated = value.substring(0, maxLength) + '...';
    //         const span = document.createElement('span');
    //         span.textContent = truncated;
    //         span.title = value; // Show full text on hover
    //         return span;
    //     }
    //     return value;
    // },

    // Token-distribution entropy (2 decimal places), or N/A when absent.
    tokenEntropy: function(value, row, col) {
        const tokenStats = row.stats.token_activations;
        if (!tokenStats) {
            return 'N/A ';
        }
        return tokenStats.entropy.toFixed(2);
    },

    // Token concentration ratio rendered as a percentage, or N/A when absent.
    tokenConcentration: function(value, row, col) {
        const tokenStats = row.stats.token_activations;
        if (!tokenStats) {
            return 'N/A ';
        }
        return (tokenStats.concentration_ratio * 100).toFixed(1) + '%';
    },

    // Top-5 tokens with percentage-of-activations, colored white->red by
    // share relative to the top token.
    topToken: function(value, row, col) {
        const tokenStats = row.stats.token_activations;
        if (!tokenStats || !tokenStats.top_tokens || tokenStats.top_tokens.length === 0) {
            return 'N/A ';
        }

        const container = document.createElement('div');
        container.style.fontFamily = 'monospace';
        container.style.fontSize = '11px';
        container.style.lineHeight = '1.4';

        const topN = Math.min(5, tokenStats.top_tokens.length);
        // Assumes top_tokens is sorted descending by count — TODO confirm.
        const maxPercentage = tokenStats.top_tokens.length > 0
            ? ((tokenStats.top_tokens[0].count / tokenStats.total_activations) * 100)
            : 0;

        for (let i = 0; i < topN; i++) {
            const token = tokenStats.top_tokens[i];
            // Middle dot for spaces, return arrow for newlines
            const tokenDisplay = token.token.replace(/ /g, '·').replace(/\n/g, '↵');
            const percentageValue = ((token.count / tokenStats.total_activations) * 100);
            const percentage = percentageValue.toFixed(1);

            // Color based on percentage (normalized by max percentage)
            const normalizedPct = maxPercentage > 0 ? percentageValue / maxPercentage : 0;
            const intensity = Math.floor((1 - normalizedPct) * 255);
            const bgColor = `rgb(255, ${intensity}, ${intensity})`;

            const line = document.createElement('div');
            line.style.display = 'flex';
            line.style.justifyContent = 'space-between';
            line.style.gap = '8px';

            const tokenSpan = document.createElement('span');
            // NOTE(review): template appears to have lost wrapper markup in
            // extraction — verify against the original source.
            tokenSpan.innerHTML = `${tokenDisplay}`;
            tokenSpan.style.textAlign = 'left';

            const pctSpan = document.createElement('span');
            pctSpan.textContent = `${percentage}%`;
            pctSpan.style.textAlign = 'right';
            pctSpan.style.backgroundColor = bgColor;
            pctSpan.style.padding = '2px 4px';
            pctSpan.style.borderRadius = '2px';

            line.appendChild(tokenSpan);
            line.appendChild(pctSpan);
            container.appendChild(line);
        }

        return container;
    },

    // Generic histogram renderer factory for any BinnedData stat:
    // returns a renderer bound to (statKey, color, title).
    genericHistogram: function(statKey, color, title) {
        return function(value, row, col) {
            const histData = row.stats[statKey];
            if (!histData || !histData.bin_counts) {
                return 'No data ';
            }

            const container = document.createElement('div');
            container.className = 'sparkline-cell';

            // Calculate bin centers for x-axis
            const binCenters = calculateBinCenters(histData.bin_edges);

            // Calculate statistics of underlying data
            const min = histData.bin_edges[0];
            const max = histData.bin_edges[histData.bin_edges.length - 1];

            // Set x-axis limits to [0, 1] if data is in that range
            const xlims = (min >= 0 && max <= 1) ? [0, 1] : null;

            // Pass bin centers as x-values and counts as y-values
            const svg = sparkbars(binCenters, histData.bin_counts, {
                width: CONFIG.visualization.sparklineWidth,
                height: CONFIG.visualization.sparklineHeight,
                color: color,
                shading: true,
                lineWidth: 0,
                markers: '',
                margin: 2,
                xlims: xlims,
                ylims: [0, null],
                logScale: true,
                xAxis: {line: true, ticks: true, label_margin: 10},
                yAxis: {line: true, ticks: true, label_margin: CONFIG.visualization.sparklineYAxisMargin}
            });

            container.innerHTML = svg;

            const mean = calculateHistogramMean(histData);
            const median = calculateHistogramMedian(histData);
            const totalCount = histData.bin_counts.reduce((a, b) => a + b, 0);

            container.title = `${title} (n=${totalCount})\n\nMin: ${min.toFixed(4)}\nMax: ${max.toFixed(4)}\nMean: ${mean.toFixed(4)}\nMedian: ${median.toFixed(4)}`;

            return container;
        };
    }
};
+
+// ============================================================================
+// Helper Functions for Filtering and Sorting
+// ============================================================================
+
/**
 * Create a filter predicate for module-name arrays.
 *
 * Syntax (case-insensitive):
 *   ','  separates OR groups
 *   '&'  separates AND conditions within a group
 *   '@'  prefix: ALL modules in the row must match the pattern
 *   '!'  prefix: NO module in the row may match the pattern
 *   '*'  wildcard matching any run of characters; patterns containing '*'
 *        are matched against the whole module name, others by substring
 *
 * Fix: regex metacharacters other than '*' (e.g. '.', '+', '(') in user
 * patterns are now escaped, so "layers.0" no longer matches "layersX0".
 *
 * @param {string} filterValue - The filter pattern (supports * wildcards, , for OR, & for AND, @ for all-match, ! for negation)
 * @returns {Function|null} Filter function or null if invalid
 */
function createModuleFilter(filterValue) {
    if (!filterValue || !filterValue.trim()) return null;

    // Escape regex metacharacters, then turn the '*' wildcard into '.*'.
    const wildcardToRegex = (pattern) => new RegExp(
        '^' + pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*') + '$'
    );

    // Split by comma for OR groups
    const orGroups = filterValue.split(',').map(g => g.trim()).filter(g => g);

    // Parse each OR group (which may contain & for AND)
    const parsedOrGroups = orGroups.map(group => {
        // Split by & for AND conditions within this OR group
        const andConditions = group.split('&').map(c => c.trim()).filter(c => c);

        return andConditions.map(condition => {
            let mode = 'some'; // default: at least one module matches
            let negate = false;
            let pattern = condition.toLowerCase();

            // Check for @ prefix (all modules must match)
            if (pattern.startsWith('@')) {
                mode = 'every';
                pattern = pattern.substring(1);
            }
            // Check for ! prefix (no modules can match)
            else if (pattern.startsWith('!')) {
                negate = true;
                pattern = pattern.substring(1);
            }

            const regex = pattern.includes('*') ? wildcardToRegex(pattern) : null;

            return { mode, negate, pattern, regex };
        });
    });

    return (cellValue) => {
        // cellValue is the modules array
        if (!Array.isArray(cellValue)) return false;

        // OR logic across groups
        return parsedOrGroups.some(andGroup => {
            // AND logic within group
            return andGroup.every(condition => {
                const matchFn = (module) => {
                    const moduleLower = module.toLowerCase();
                    return condition.regex
                        ? condition.regex.test(moduleLower)
                        : moduleLower.includes(condition.pattern);
                };

                if (condition.mode === 'every') {
                    // ALL modules must match
                    const result = cellValue.every(matchFn);
                    return condition.negate ? !result : result;
                } else {
                    // At least ONE module must match (or none if negated)
                    const result = cellValue.some(matchFn);
                    return condition.negate ? !result : result;
                }
            });
        });
    };
}
+
/**
 * Build a sortable key for a modules array.
 * Primary: number of modules (ascending, via zero-padded count).
 * Secondary: first module name, lowercased.
 *
 * @param {Array} modules - Array of module names
 * @returns {string} Sortable string representation ('' for non-arrays/empty)
 */
function sortModules(modules) {
    if (!Array.isArray(modules) || modules.length === 0) {
        return '';
    }
    // Zero-pad the count so lexicographic ordering matches numeric ordering.
    const paddedCount = String(modules.length).padStart(5, '0');
    return `${paddedCount}_${modules[0].toLowerCase()}`;
}
+
+/**
+ * Parse extended histogram filter syntax (e.g., "mean>0.5", "max<10", "mean>0.5, max<10")
+ * @param {string} filterValue - The filter string (can be comma-separated for multiple conditions)
+ * @returns {Array|null} Array of parsed filters [{ statType, operator, value }] or null if plain numeric
+ */
+function parseHistogramFilter(filterValue) {
+ const trimmed = filterValue.trim();
+ if (!trimmed) return null;
+
+ // Split by comma to support multiple conditions
+ const conditions = trimmed.split(',').map(c => c.trim());
+ const parsedConditions = [];
+
+ for (const condition of conditions) {
+ // Match pattern: statType operator value (e.g., "mean>0.5", "median<=0.2")
+ const match = condition.match(/^(mean|median|max|min|range|sum)\s*(==|!=|>=|<=|>|<)\s*(-?\d+\.?\d*)$/i);
+
+ if (match) {
+ parsedConditions.push({
+ statType: match[1].toLowerCase(),
+ operator: match[2],
+ value: parseFloat(match[3])
+ });
+ } else {
+ // If any condition doesn't match, return null to use default filter
+ return null;
+ }
+ }
+
+ // Return array of conditions, or null if none were found
+ return parsedConditions.length > 0 ? parsedConditions : null;
+}
+
+/**
+ * Create a filter function for histogram columns with extended syntax
+ * Supports multiple comma-separated conditions (AND logic)
+ * @param {string} statKey - The statistics key
+ * @param {string} filterValue - The filter string (e.g., "mean>0.5, max<10")
+ * @returns {Function|null} Filter function or null to use default
+ */
+function createHistogramFilter(statKey, filterValue) {
+ const parsedConditions = parseHistogramFilter(filterValue);
+
+ if (!parsedConditions) {
+ // Return null to let default numeric filter handle it
+ // Default will filter on the sort value (mean by default)
+ return null;
+ }
+
+ return (cellValue, row) => {
+ // All conditions must be satisfied (AND logic)
+ for (const condition of parsedConditions) {
+ const { statType, operator, value } = condition;
+ const histData = row.stats[statKey];
+
+ if (!histData || !histData.bin_counts || !histData.bin_edges) return false;
+
+ // Calculate the requested statistic
+ let statValue;
+ switch (statType) {
+ case 'mean':
+ // For all_activations, use precomputed mean
+ if (statKey === 'all_activations' && row.stats.mean_activation !== undefined) {
+ statValue = row.stats.mean_activation;
+ } else {
+ statValue = calculateHistogramMean(histData);
+ }
+ break;
+ case 'median':
+ statValue = calculateHistogramMedian(histData);
+ break;
+ case 'max':
+ statValue = histData.bin_edges[histData.bin_edges.length - 1];
+ break;
+ case 'min':
+ statValue = histData.bin_edges[0];
+ break;
+ case 'range':
+ statValue = histData.bin_edges[histData.bin_edges.length - 1] - histData.bin_edges[0];
+ break;
+ case 'sum':
+ statValue = histData.bin_counts.reduce((a, b) => a + b, 0);
+ break;
+ default:
+ return false;
+ }
+
+ if (statValue === null || statValue === undefined) return false;
+
+ let conditionMet = false;
+ switch (operator) {
+ case '==': conditionMet = Math.abs(statValue - value) < 0.0001; break;
+ case '!=': conditionMet = Math.abs(statValue - value) >= 0.0001; break;
+ case '>': conditionMet = statValue > value; break;
+ case '<': conditionMet = statValue < value; break;
+ case '>=': conditionMet = statValue >= value; break;
+ case '<=': conditionMet = statValue <= value; break;
+ default: conditionMet = false;
+ }
+
+ // If any condition fails, return false
+ if (!conditionMet) return false;
+ }
+
+ // All conditions passed
+ return true;
+ };
+}
+
+/**
+ * Get the top token string for sorting
+ * @param {object} value - Cell value (stats object)
+ * @param {object} row - The data row
+ * @returns {string} The top token string for sorting
+ */
+function sortTopToken(value, row) {
+ const tokenStats = row.stats.token_activations;
+ if (!tokenStats || !tokenStats.top_tokens || tokenStats.top_tokens.length === 0) {
+ return '';
+ }
+ return tokenStats.top_tokens[0].token.toLowerCase();
+}
+
+/**
+ * Create a filter function for top tokens
+ * @param {string} filterValue - The filter string
+ * @returns {Function|null} Filter function or null if invalid
+ */
+function createTopTokenFilter(filterValue) {
+ if (!filterValue || !filterValue.trim()) return null;
+
+ const pattern = filterValue.toLowerCase().trim();
+
+ return (cellValue, row) => {
+ const tokenStats = row.stats.token_activations;
+ if (!tokenStats || !tokenStats.top_tokens) return false;
+
+ // Search in top 10 tokens
+ const topN = Math.min(10, tokenStats.top_tokens.length);
+ for (let i = 0; i < topN; i++) {
+ const token = tokenStats.top_tokens[i].token.toLowerCase();
+ if (token.includes(pattern)) {
+ return true;
+ }
+ }
+ return false;
+ };
+}
+
+/**
+ * Create a filter function for numeric comparisons with operators
+ * @param {string} filterValue - The filter string (e.g., ">2.5", "<=0.8")
+ * @param {Function} valueExtractor - Function to extract numeric value from cellValue
+ * @returns {Function|null} Filter function or null if invalid
+ */
+function createNumericFilter(filterValue, valueExtractor) {
+ if (!filterValue || !filterValue.trim()) return null;
+
+ const trimmed = filterValue.trim();
+
+ // Match pattern: operator value (e.g., ">2.5", "<=0.8")
+ const match = trimmed.match(/^(==|!=|>=|<=|>|<)\s*(-?\d+\.?\d*)$/);
+
+ if (!match) {
+ // Try plain number (defaults to ==)
+ const plainNum = parseFloat(trimmed);
+ if (!isNaN(plainNum)) {
+ return (cellValue, row) => {
+ const value = valueExtractor(cellValue);
+ if (value === null || value === undefined) return false;
+ return Math.abs(value - plainNum) < 0.0001;
+ };
+ }
+ return null;
+ }
+
+ const operator = match[1];
+ const targetValue = parseFloat(match[2]);
+
+ return (cellValue, row) => {
+ const value = valueExtractor(cellValue);
+ if (value === null || value === undefined) return false;
+
+ switch (operator) {
+ case '==': return Math.abs(value - targetValue) < 0.0001;
+ case '!=': return Math.abs(value - targetValue) >= 0.0001;
+ case '>': return value > targetValue;
+ case '<': return value < targetValue;
+ case '>=': return value >= targetValue;
+ case '<=': return value <= targetValue;
+ default: return false;
+ }
+ };
+}
+
+async function processClusterData() {
+ const tableData = [];
+
+ for (const [clusterHash, cluster] of Object.entries(clusterData)) {
+ // Await lazy-loaded components
+ const components = await cluster.components;
+
+ const modules = new Set();
+ components.forEach(comp => {
+ modules.add(comp.module);
+ });
+
+ const stats = cluster.stats;
+
+ // Extract cluster ID from hash (format: "runid-iteration-clusteridx")
+ const parts = clusterHash.split('-');
+ const clusterId = parseInt(parts[parts.length - 1]);
+
+ // TODO: Re-enable explanations feature
+ // Get explanation for this cluster
+ // const explanationData = explanations[clusterHash];
+ // const explanation = explanationData ? explanationData.explanation : null;
+
+ tableData.push({
+ id: clusterId,
+ clusterHash: clusterHash,
+ componentCount: components.length,
+ modules: Array.from(modules),
+ stats: stats,
+ // TODO: Re-enable explanations feature
+ // explanation: explanation
+ });
+ }
+
+ return tableData;
+}
+
+/**
+ * Load the dashboard dataset, build all column definitions, and render the
+ * cluster index table.
+ *
+ * Side effects: assigns the module-level globals `clusterData`, `modelInfo`
+ * and `dataTable`, and hides the #loading element on success.
+ */
+async function loadData() {
+ // Load data via ZANJ
+ const loader = new ZanjLoader(CONFIG.data.dataDir);
+ const data = await loader.read();
+
+ // Extract data (assigned to module-level globals read elsewhere)
+ clusterData = data.clusters;
+ modelInfo = data.model_info;
+
+ // TODO: Re-enable explanations feature
+ // Load explanations separately (not part of ZANJ)
+ // explanations = await loadJSONL(CONFIG.getDataPath('explanations'), 'cluster_id').catch(() => ({}));
+
+ const tableData = await processClusterData();
+
+ // Discover histogram stats from first cluster
+ // (any stats entry carrying bin_counts + bin_edges is treated as a
+ // histogram; assumes all clusters share the same stats schema — TODO confirm)
+ const firstCluster = Object.values(clusterData)[0];
+ const histogramStats = [];
+ if (firstCluster && firstCluster.stats) {
+ for (const [key, value] of Object.entries(firstCluster.stats)) {
+ if (value && typeof value === 'object' && 'bin_counts' in value && 'bin_edges' in value) {
+ histogramStats.push(key);
+ }
+ }
+ }
+
+ // Base columns
+ // (width values are passed through to DataTable as-is — presumably CSS
+ // widths; verify units against the DataTable implementation)
+ const columns = [
+ {
+ key: 'id',
+ label: 'ID',
+ type: 'number',
+ width: '10px',
+ align: 'center'
+ },
+ {
+ key: 'componentCount',
+ label: 'Comps',
+ type: 'number',
+ width: '10px',
+ align: 'right'
+ },
+ {
+ key: 'modules',
+ label: 'Model View',
+ type: 'string',
+ width: '21px',
+ align: 'center',
+ renderer: columnRenderers.modelView,
+ sortFunction: (modules) => sortModules(modules),
+ filterFunction: (filterValue) => createModuleFilter(filterValue),
+ filterTooltip: 'Filter by module. Separate with , (OR) or & (AND). Use * for wildcards. Prefix @ for all-match, ! to exclude. Examples: *mlp*,*attn* (OR), *mlp*&*attn* (AND), @*proj* (all), !*o_proj* (exclude)'
+ },
+ {
+ // Intentionally shares key 'modules' with the column above; the two
+ // differ only in renderer (graphical model view vs. text summary)
+ key: 'modules',
+ label: 'Modules',
+ type: 'string',
+ width: '10px',
+ renderer: columnRenderers.modulesSummary,
+ sortFunction: (modules) => sortModules(modules),
+ filterFunction: (filterValue) => createModuleFilter(filterValue),
+ filterTooltip: 'Filter by module. Separate with , (OR) or & (AND). Use * for wildcards. Prefix @ for all-match, ! to exclude. Examples: *mlp*,*attn* (OR), *mlp*&*attn* (AND), @*proj* (all), !*o_proj* (exclude)'
+ }
+ ];
+
+ // Add histogram columns dynamically
+ // Fixed color per known stat key; unknown keys fall back to gray below
+ const statColors = {
+ 'all_activations': '#4169E1',
+ 'max_activation-max-16': '#DC143C',
+ 'max_activation-max-32': '#DC143C',
+ 'mean_activation-max-16': '#228B22',
+ 'median_activation-max-16': '#FF8C00',
+ 'min_activation-max-16': '#9370DB',
+ 'max_activation_position': '#FF6347'
+ };
+
+ histogramStats.forEach(statKey => {
+ const color = statColors[statKey] || '#808080';
+ // Title-case the stat key for the column header, e.g.
+ // "max_activation-max-16" -> "Max Activation Max 16"
+ const label = statKey.replace(/-/g, ' ').replace(/_/g, ' ')
+ .split(' ')
+ .map(word => word.charAt(0).toUpperCase() + word.slice(1))
+ .join(' ');
+
+ columns.push({
+ id: 'histogram_' + statKey,
+ key: 'stats',
+ label: label,
+ type: 'number',
+ width: '200px',
+ align: 'center',
+ renderer: columnRenderers.genericHistogram(statKey, color, label),
+ // Sort on the histogram mean (precomputed for all_activations)
+ sortFunction: (value, row) => {
+ const histData = row.stats[statKey];
+ if (!histData || !histData.bin_counts || !histData.bin_edges) return -Infinity;
+ // For all_activations, use precomputed mean
+ if (statKey === 'all_activations' && row.stats.mean_activation !== undefined) {
+ return row.stats.mean_activation;
+ }
+ // Otherwise calculate mean from histogram
+ return calculateHistogramMean(histData);
+ },
+ filterFunction: (filterValue) => createHistogramFilter(statKey, filterValue),
+ filterTooltip: 'Filter by statistics. Use: mean>0.5, median<0.2, max>=1.0, min>-0.1, range<5, sum>100. Combine with commas (e.g., mean>0.5, max<10)'
+ });
+ });
+
+ // Token activation columns
+ columns.push({
+ id: 'top_tokens',
+ key: 'stats',
+ label: 'Top Tokens',
+ type: 'string',
+ width: '150px',
+ align: 'left',
+ renderer: columnRenderers.topToken,
+ sortFunction: (value, row) => sortTopToken(value, row),
+ filterFunction: (filterValue) => createTopTokenFilter(filterValue),
+ filterTooltip: 'Search for tokens (case-insensitive substring match)'
+ });
+
+ columns.push({
+ id: 'token_entropy',
+ key: 'stats',
+ label: 'Token Entropy',
+ type: 'number',
+ width: '60px',
+ align: 'right',
+ renderer: columnRenderers.tokenEntropy,
+ sortFunction: (value, row) => {
+ const tokenStats = row.stats.token_activations;
+ return tokenStats ? tokenStats.entropy : -Infinity;
+ },
+ filterFunction: (filterValue) => createNumericFilter(filterValue, (stats) => {
+ const tokenStats = stats?.token_activations;
+ return tokenStats ? tokenStats.entropy : null;
+ }),
+ filterTooltip: 'Filter by entropy. Use operators: >, <, >=, <=, ==, != (e.g., >2.5)'
+ });
+
+ columns.push({
+ id: 'token_concentration',
+ key: 'stats',
+ label: 'Token Conc.',
+ type: 'number',
+ width: '60px',
+ align: 'right',
+ renderer: columnRenderers.tokenConcentration,
+ sortFunction: (value, row) => {
+ const tokenStats = row.stats.token_activations;
+ return tokenStats ? tokenStats.concentration_ratio : -Infinity;
+ },
+ filterFunction: (filterValue) => createNumericFilter(filterValue, (stats) => {
+ const tokenStats = stats?.token_activations;
+ return tokenStats ? tokenStats.concentration_ratio : null;
+ }),
+ filterTooltip: 'Filter by concentration (0-1). Use operators: >, <, >=, <=, ==, != (e.g., >0.5)'
+ });
+
+ // TODO: Re-enable explanations feature
+ // Explanation column
+ // columns.push({
+ // key: 'explanation',
+ // label: 'Explanation',
+ // type: 'string',
+ // width: '200px',
+ // align: 'left',
+ // renderer: columnRenderers.explanation,
+ // filterTooltip: 'Filter by explanation text (case-insensitive substring match)'
+ // });
+
+ // Actions column (per-row link to the cluster page; not filterable)
+ columns.push({
+ key: 'id',
+ label: 'Actions',
+ type: 'string',
+ width: '20px',
+ align: 'center',
+ renderer: columnRenderers.clusterLink,
+ filterable: false
+ });
+
+ const tableConfig = {
+ data: tableData,
+ columns: columns,
+ pageSize: CONFIG.indexPage.pageSize,
+ pageSizeOptions: CONFIG.indexPage.pageSizeOptions,
+ showFilters: CONFIG.indexPage.showFilters
+ };
+
+ // Instantiate the table into its container (module-level global)
+ dataTable = new DataTable('#clusterTableContainer', tableConfig);
+
+ // Hide the loading indicator; its absence is a page-template bug
+ const loading = document.getElementById('loading');
+ if (!loading) {
+ const msg = 'Fatal error: loading element not found in HTML';
+ NOTIF.error(msg, null, null);
+ console.error(msg);
+ return;
+ }
+ loading.style.display = 'none';
+}
+
+document.addEventListener('DOMContentLoaded', async () => {
+ await initConfig();
+
+ // Check if Alpine.js loaded
+ if (typeof Alpine === 'undefined') {
+ const msg = 'Fatal error: Alpine.js failed to load. Check your internet connection or CDN.';
+ NOTIF.error(msg, null, null);
+ console.error(msg);
+ }
+
+ // Load cluster data and render table (includes model info from ZANJ)
+ await loadData();
+
+ // Populate Alpine.js component with loaded model info
+ const modelInfoEl = document.getElementById('modelInfo');
+ if (modelInfoEl && Alpine.$data(modelInfoEl)) {
+ Alpine.$data(modelInfoEl).data = modelInfo;
+ Alpine.$data(modelInfoEl).hasData = Object.keys(modelInfo).length > 0;
+ }
+});
diff --git a/spd/clustering/dashboard/js/model-visualization.js b/spd/clustering/dashboard/js/model-visualization.js
new file mode 100644
index 000000000..ee5190c9a
--- /dev/null
+++ b/spd/clustering/dashboard/js/model-visualization.js
@@ -0,0 +1,225 @@
+// Self-contained utilities for model visualization
+// No global variables, all functions take necessary data as parameters
+
+async function getClusterModuleStats(clusterId, clusterData) {
+ if (!clusterData || !clusterData[clusterId]) return {};
+
+ const cluster = clusterData[clusterId];
+ const moduleStats = {};
+
+ // Await lazy-loaded components
+ const components = await cluster.components;
+
+ // Count components per module for this specific cluster
+ components.forEach(comp => {
+ const module = comp.module;
+ if (!moduleStats[module]) {
+ moduleStats[module] = {
+ componentCount: 0,
+ components: []
+ };
+ }
+ moduleStats[module].componentCount++;
+ moduleStats[module].components.push(comp);
+ });
+
+ return moduleStats;
+}
+
+function getModuleOrder(moduleName) {
+ if (moduleName.includes('q_proj')) return 0;
+ if (moduleName.includes('k_proj')) return 1;
+ if (moduleName.includes('v_proj')) return 2;
+ if (moduleName.includes('o_proj')) return 3;
+ if (moduleName.includes('gate_proj')) return 10;
+ if (moduleName.includes('up_proj')) return 11;
+ if (moduleName.includes('down_proj')) return 12;
+ return 999;
+}
+
+async function renderModelArchitecture(clusterId, clusterData, modelInfo, colormap = 'blues') {
+ if (!modelInfo || !modelInfo.module_list) {
+ throw new Error('Model info not loaded');
+ }
+
+ const moduleStats = clusterData && clusterData[clusterId] ? await getClusterModuleStats(clusterId, clusterData) : {};
+ const maxComponents = Math.max(...Object.values(moduleStats).map(s => s.componentCount), 1);
+
+ // Group ALL modules from model_info by layer and type
+ const layerGroups = {};
+
+ modelInfo.module_list.forEach(moduleName => {
+ const parts = moduleName.split('.');
+ let layerNum = -1;
+ let moduleType = 'other';
+
+ for (let i = 0; i < parts.length; i++) {
+ if (parts[i] === 'layers' && i + 1 < parts.length) {
+ layerNum = parseInt(parts[i + 1]);
+ }
+ }
+
+ if (moduleName.includes('self_attn')) {
+ moduleType = 'attention';
+ } else if (moduleName.includes('mlp')) {
+ moduleType = 'mlp';
+ }
+
+ if (!layerGroups[layerNum]) {
+ layerGroups[layerNum] = { attention: [], mlp: [], other: [] };
+ }
+
+ const count = moduleStats[moduleName] ? moduleStats[moduleName].componentCount : 0;
+ const components = moduleStats[moduleName] ? moduleStats[moduleName].components : [];
+
+ layerGroups[layerNum][moduleType].push({
+ name: moduleName,
+ count: count,
+ components: components
+ });
+ });
+
+ // Sort modules within each group by desired order
+ Object.values(layerGroups).forEach(layer => {
+ layer.attention.sort((a, b) => getModuleOrder(a.name) - getModuleOrder(b.name));
+ layer.mlp.sort((a, b) => getModuleOrder(a.name) - getModuleOrder(b.name));
+ });
+
+ const sortedLayers = Object.keys(layerGroups).sort((a, b) => a - b);
+ const cellSize = 12;
+
+ const moduleElements = [];
+
+ sortedLayers.forEach(layerNum => {
+ const layer = layerGroups[layerNum];
+ const layerElements = [];
+
+ // Attention row (above MLP)
+ if (layer.attention.length > 0) {
+ const attentionRow = layer.attention.map(module => ({
+ type: 'cell',
+ module: module.name,
+ count: module.count,
+ components: module.components.map(c => c.index).join(','),
+ color: getColorForValue(module.count, maxComponents, colormap),
+ size: cellSize
+ }));
+ layerElements.push({ type: 'row', cells: attentionRow });
+ }
+
+ // MLP row (below attention)
+ if (layer.mlp.length > 0) {
+ const mlpRow = layer.mlp.map(module => ({
+ type: 'cell',
+ module: module.name,
+ count: module.count,
+ components: module.components.map(c => c.index).join(','),
+ color: getColorForValue(module.count, maxComponents, colormap),
+ size: cellSize
+ }));
+ layerElements.push({ type: 'row', cells: mlpRow });
+ }
+
+ // Other modules
+ if (layer.other.length > 0) {
+ const otherRow = layer.other.map(module => ({
+ type: 'cell',
+ module: module.name,
+ count: module.count,
+ components: module.components.map(c => c.index).join(','),
+ color: getColorForValue(module.count, maxComponents, colormap),
+ size: cellSize
+ }));
+ layerElements.push({ type: 'row', cells: otherRow });
+ }
+
+ if (layerElements.length > 0) {
+ moduleElements.push({ type: 'layer', rows: layerElements });
+ }
+ });
+
+ return {
+ elements: moduleElements,
+ maxComponents: maxComponents
+ };
+}
+
+function renderToHTML(architecture) {
+ let html = '';
+
+ architecture.elements.forEach(layer => {
+ html += '';
+ layer.rows.forEach(row => {
+ html += '
';
+ row.cells.forEach(cell => {
+ html += `
`;
+ });
+ html += '
';
+ });
+ html += '
';
+ });
+
+ return html;
+}
+
+// Consolidated tooltip setup - works for all model visualizations
+function setupTooltips(containerElement) {
+ const tooltip = document.getElementById('tooltip');
+ if (!tooltip) return;
+
+ const cells = containerElement.querySelectorAll('.modelview-module-cell');
+
+ cells.forEach(cell => {
+ cell.addEventListener('mouseenter', (e) => {
+ const module = e.target.dataset.module;
+ const count = e.target.dataset.count;
+ const components = e.target.dataset.components;
+
+ if (module) {
+ tooltip.textContent = `${module}\nComponents: ${count}\nIndices: ${components || 'none'}`;
+ tooltip.style.display = 'block';
+ tooltip.style.left = (e.pageX + 10) + 'px';
+ tooltip.style.top = (e.pageY + 10) + 'px';
+ }
+ });
+
+ cell.addEventListener('mouseleave', () => {
+ tooltip.style.display = 'none';
+ });
+
+ cell.addEventListener('mousemove', (e) => {
+ tooltip.style.left = (e.pageX + 10) + 'px';
+ tooltip.style.top = (e.pageY + 10) + 'px';
+ });
+ });
+}
+
+// Consolidated render function - creates model visualization in a container
+// Builds the architecture description for clusterHash, serialises it to
+// HTML into containerElement, then wires up tooltips.
+// NOTE(review): the innerHTML fallback strings below appear to have lost
+// their markup in this copy of the file (trailing stray space before the
+// closing quote) — verify against the original source.
+// NOTE(review): cellSize only sets the '--modelview-cell-size' CSS
+// variable; renderModelArchitecture uses its own fixed size internally.
+async function renderModelView(containerElement, clusterHash, clusterData, modelInfo, colormap = 'blues', cellSize = null) {
+ if (!modelInfo || !modelInfo.module_list) {
+ containerElement.innerHTML = 'Model info loading... ';
+ return;
+ }
+
+ if (!clusterData || !clusterData[clusterHash]) {
+ containerElement.innerHTML = 'Cluster data missing ';
+ return;
+ }
+
+ try {
+ const architecture = await renderModelArchitecture(clusterHash, clusterData, modelInfo, colormap);
+ const html = renderToHTML(architecture);
+ containerElement.innerHTML = html;
+
+ // Apply cell size from config if provided
+ if (cellSize !== null) {
+ containerElement.style.setProperty('--modelview-cell-size', cellSize + 'px');
+ }
+
+ // Setup tooltips after a brief delay to ensure DOM is ready
+ setTimeout(() => setupTooltips(containerElement), 0);
+ } catch (error) {
+ console.error('Failed to render model visualization:', error);
+ containerElement.innerHTML = 'Model visualization error ';
+ }
+}
\ No newline at end of file
diff --git a/spd/clustering/dashboard/js/pkg/jszip.js b/spd/clustering/dashboard/js/pkg/jszip.js
new file mode 100644
index 000000000..60fbb41a6
--- /dev/null
+++ b/spd/clustering/dashboard/js/pkg/jszip.js
@@ -0,0 +1,11577 @@
+/*!
+
+JSZip v3.10.1 - A JavaScript class for generating and reading zip files
+
+
+(c) 2009-2016 Stuart Knightley
+Dual licenced under the MIT license or GPLv3. See https://raw.github.com/Stuk/jszip/main/LICENSE.markdown.
+
+JSZip uses the library pako released under the MIT license :
+https://github.com/nodeca/pako/blob/main/LICENSE
+*/
+
+(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.JSZip = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o> 2;
+ enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
+ enc3 = remainingBytes > 1 ? (((chr2 & 15) << 2) | (chr3 >> 6)) : 64;
+ enc4 = remainingBytes > 2 ? (chr3 & 63) : 64;
+
+ output.push(_keyStr.charAt(enc1) + _keyStr.charAt(enc2) + _keyStr.charAt(enc3) + _keyStr.charAt(enc4));
+
+ }
+
+ return output.join("");
+};
+
+// public method for decoding
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+// Decodes a base64 string into a Uint8Array (or a plain Array when typed
+// arrays are unsupported). Rejects data-URLs and inputs whose length is
+// not a valid base64 length after stripping non-base64 characters.
+exports.decode = function(input) {
+ var chr1, chr2, chr3;
+ var enc1, enc2, enc3, enc4;
+ var i = 0, resultIndex = 0;
+
+ var dataUrlPrefix = "data:";
+
+ if (input.substr(0, dataUrlPrefix.length) === dataUrlPrefix) {
+ // This is a common error: people give a data url
+ // (data:image/png;base64,iVBOR...) with a {base64: true} and
+ // wonders why things don't work.
+ // We can detect that the string input looks like a data url but we
+ // *can't* be sure it is one: removing everything up to the comma would
+ // be too dangerous.
+ throw new Error("Invalid base64 input, it looks like a data url.");
+ }
+
+ input = input.replace(/[^A-Za-z0-9+/=]/g, "");
+
+ // 4 base64 chars encode 3 bytes; each trailing '=' pad drops one byte
+ var totalLength = input.length * 3 / 4;
+ if(input.charAt(input.length - 1) === _keyStr.charAt(64)) {
+ totalLength--;
+ }
+ if(input.charAt(input.length - 2) === _keyStr.charAt(64)) {
+ totalLength--;
+ }
+ if (totalLength % 1 !== 0) {
+ // totalLength is not an integer, the length does not match a valid
+ // base64 content. That can happen if:
+ // - the input is not a base64 content
+ // - the input is *almost* a base64 content, with a extra chars at the
+ // beginning or at the end
+ // - the input uses a base64 variant (base64url for example)
+ throw new Error("Invalid base64 input, bad content length.");
+ }
+ var output;
+ if (support.uint8array) {
+ output = new Uint8Array(totalLength|0);
+ } else {
+ output = new Array(totalLength|0);
+ }
+
+ // Consume 4 encoded chars per iteration, emitting up to 3 bytes
+ while (i < input.length) {
+
+ enc1 = _keyStr.indexOf(input.charAt(i++));
+ enc2 = _keyStr.indexOf(input.charAt(i++));
+ enc3 = _keyStr.indexOf(input.charAt(i++));
+ enc4 = _keyStr.indexOf(input.charAt(i++));
+
+ chr1 = (enc1 << 2) | (enc2 >> 4);
+ chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);
+ chr3 = ((enc3 & 3) << 6) | enc4;
+
+ output[resultIndex++] = chr1;
+
+ // Index 64 is the '=' padding character — no byte to emit
+ if (enc3 !== 64) {
+ output[resultIndex++] = chr2;
+ }
+ if (enc4 !== 64) {
+ output[resultIndex++] = chr3;
+ }
+
+ }
+
+ return output;
+};
+
+},{"./support":30,"./utils":32}],2:[function(require,module,exports){
+"use strict";
+
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+var external = require("./external");
+var DataWorker = require("./stream/DataWorker");
+var Crc32Probe = require("./stream/Crc32Probe");
+var DataLengthProbe = require("./stream/DataLengthProbe");
+
+/**
+ * Represent a compressed object, with everything needed to decompress it.
+ * @constructor
+ * @param {number} compressedSize the size of the data compressed.
+ * @param {number} uncompressedSize the size of the data after decompression.
+ * @param {number} crc32 the crc32 of the decompressed file.
+ * @param {object} compression the type of compression, see lib/compressions.js.
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the compressed data.
+ */
+function CompressedObject(compressedSize, uncompressedSize, crc32, compression, data) {
+ this.compressedSize = compressedSize;
+ this.uncompressedSize = uncompressedSize;
+ this.crc32 = crc32;
+ this.compression = compression;
+ this.compressedContent = data;
+}
+
+CompressedObject.prototype = {
+ /**
+ * Create a worker to get the uncompressed content.
+ * @return {GenericWorker} the worker.
+ */
+ getContentWorker: function () {
+ var worker = new DataWorker(external.Promise.resolve(this.compressedContent))
+ .pipe(this.compression.uncompressWorker())
+ .pipe(new DataLengthProbe("data_length"));
+
+ // Sanity check: decompressed length must match the recorded size
+ var that = this;
+ worker.on("end", function () {
+ if (this.streamInfo["data_length"] !== that.uncompressedSize) {
+ throw new Error("Bug : uncompressed data size mismatch");
+ }
+ });
+ return worker;
+ },
+ /**
+ * Create a worker to get the compressed content.
+ * @return {GenericWorker} the worker.
+ */
+ getCompressedWorker: function () {
+ return new DataWorker(external.Promise.resolve(this.compressedContent))
+ .withStreamInfo("compressedSize", this.compressedSize)
+ .withStreamInfo("uncompressedSize", this.uncompressedSize)
+ .withStreamInfo("crc32", this.crc32)
+ .withStreamInfo("compression", this.compression)
+ ;
+ }
+};
+
+/**
+ * Chain the given worker with other workers to compress the content with the
+ * given compression.
+ * @param {GenericWorker} uncompressedWorker the worker to pipe.
+ * @param {Object} compression the compression object.
+ * @param {Object} compressionOptions the options to use when compressing.
+ * @return {GenericWorker} the new worker compressing the content.
+ */
+CompressedObject.createWorkerFrom = function (uncompressedWorker, compression, compressionOptions) {
+ return uncompressedWorker
+ .pipe(new Crc32Probe())
+ .pipe(new DataLengthProbe("uncompressedSize"))
+ .pipe(compression.compressWorker(compressionOptions))
+ .pipe(new DataLengthProbe("compressedSize"))
+ .withStreamInfo("compression", compression);
+};
+
+module.exports = CompressedObject;
+
+},{"./external":6,"./stream/Crc32Probe":25,"./stream/DataLengthProbe":26,"./stream/DataWorker":27}],3:[function(require,module,exports){
+"use strict";
+
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+var GenericWorker = require("./stream/GenericWorker");
+
+// STORE = no compression: both workers are plain pass-throughs; the magic
+// bytes identify the method in the zip entry header.
+exports.STORE = {
+ magic: "\x00\x00",
+ compressWorker : function () {
+ return new GenericWorker("STORE compression");
+ },
+ uncompressWorker : function () {
+ return new GenericWorker("STORE decompression");
+ }
+};
+exports.DEFLATE = require("./flate");
+
+},{"./flate":7,"./stream/GenericWorker":28}],4:[function(require,module,exports){
+"use strict";
+
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+var utils = require("./utils");
+
+/**
+ * The following functions come from pako, from pako/lib/zlib/crc32.js
+ * released under the MIT license, see pako https://github.com/nodeca/pako/
+ */
+
+// Use ordinary array, since untyped makes no boost here
+// Builds the 256-entry lookup table for the reflected CRC-32 polynomial
+// (0xEDB88320).
+function makeTable() {
+ var c, table = [];
+
+ for(var n =0; n < 256; n++){
+ c = n;
+ for(var k =0; k < 8; k++){
+ c = ((c&1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1));
+ }
+ table[n] = c;
+ }
+
+ return table;
+}
+
+// Create table on load. Just 255 signed longs. Not a problem.
+var crcTable = makeTable();
+
+
+// CRC-32 over a byte buffer slice [pos, pos+len); result may be a signed
+// 32-bit value (the commented ">>> 0" normalisation is intentionally off).
+function crc32(crc, buf, len, pos) {
+ var t = crcTable, end = pos + len;
+
+ crc = crc ^ (-1);
+
+ for (var i = pos; i < end; i++ ) {
+ crc = (crc >>> 8) ^ t[(crc ^ buf[i]) & 0xFF];
+ }
+
+ return (crc ^ (-1)); // >>> 0;
+}
+
+// That's all for the pako functions.
+
+/**
+ * Compute the crc32 of a string.
+ * This is almost the same as the function crc32, but for strings. Using the
+ * same function for the two use cases leads to horrible performances.
+ * @param {Number} crc the starting value of the crc.
+ * @param {String} str the string to use.
+ * @param {Number} len the length of the string.
+ * @param {Number} pos the starting position for the crc32 computation.
+ * @return {Number} the computed crc32.
+ */
+function crc32str(crc, str, len, pos) {
+ var t = crcTable, end = pos + len;
+
+ crc = crc ^ (-1);
+
+ for (var i = pos; i < end; i++ ) {
+ crc = (crc >>> 8) ^ t[(crc ^ str.charCodeAt(i)) & 0xFF];
+ }
+
+ return (crc ^ (-1)); // >>> 0;
+}
+
+// Dispatches to the buffer or string variant; empty/undefined input => 0.
+module.exports = function crc32wrapper(input, crc) {
+ if (typeof input === "undefined" || !input.length) {
+ return 0;
+ }
+
+ var isArray = utils.getTypeOf(input) !== "string";
+
+ if(isArray) {
+ return crc32(crc|0, input, input.length, 0);
+ } else {
+ return crc32str(crc|0, input, input.length, 0);
+ }
+};
+
+},{"./utils":32}],5:[function(require,module,exports){
+"use strict";
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+// Default per-file options applied when adding entries to a zip.
+exports.base64 = false;
+exports.binary = false;
+exports.dir = false;
+exports.createFolders = true;
+exports.date = null;
+exports.compression = null;
+exports.compressionOptions = null;
+exports.comment = null;
+exports.unixPermissions = null;
+exports.dosPermissions = null;
+
+},{}],6:[function(require,module,exports){
+"use strict";
+
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+// load the global object first:
+// - it should be better integrated in the system (unhandledRejection in node)
+// - the environment may have a custom Promise implementation (see zone.js)
+var ES6Promise = null;
+if (typeof Promise !== "undefined") {
+ ES6Promise = Promise;
+} else {
+ // Fallback Promise polyfill ("lie") for environments without native Promise
+ ES6Promise = require("lie");
+}
+
+/**
+ * Let the user use/change some implementations.
+ */
+module.exports = {
+ Promise: ES6Promise
+};
+
+},{"lie":37}],7:[function(require,module,exports){
+"use strict";
+// [vendored JSZip v3.10.1 — third-party code; keep local changes minimal]
+var USE_TYPEDARRAY = (typeof Uint8Array !== "undefined") && (typeof Uint16Array !== "undefined") && (typeof Uint32Array !== "undefined");
+
+var pako = require("pako");
+var utils = require("./utils");
+var GenericWorker = require("./stream/GenericWorker");
+
+var ARRAY_TYPE = USE_TYPEDARRAY ? "uint8array" : "array";
+
+// DEFLATE method id bytes for the zip entry header
+exports.magic = "\x08\x00";
+
+/**
+ * Create a worker that uses pako to inflate/deflate.
+ * @constructor
+ * @param {String} action the name of the pako function to call : either "Deflate" or "Inflate".
+ * @param {Object} options the options to use when (de)compressing.
+ */
+function FlateWorker(action, options) {
+ GenericWorker.call(this, "FlateWorker/" + action);
+
+ this._pako = null;
+ this._pakoAction = action;
+ this._pakoOptions = options;
+ // the `meta` object from the last chunk received
+ // this allow this worker to pass around metadata
+ this.meta = {};
+}
+
+utils.inherits(FlateWorker, GenericWorker);
+
+/**
+ * @see GenericWorker.processChunk
+ */
+FlateWorker.prototype.processChunk = function (chunk) {
+ this.meta = chunk.meta;
+ if (this._pako === null) {
+ this._createPako();
+ }
+ // false = more chunks to come; pako buffers until flush
+ this._pako.push(utils.transformTo(ARRAY_TYPE, chunk.data), false);
+};
+
+/**
+ * @see GenericWorker.flush
+ */
+FlateWorker.prototype.flush = function () {
+ GenericWorker.prototype.flush.call(this);
+ if (this._pako === null) {
+ this._createPako();
+ }
+ // true = final chunk; forces pako to emit remaining output
+ this._pako.push([], true);
+};
+/**
+ * @see GenericWorker.cleanUp
+ */
+FlateWorker.prototype.cleanUp = function () {
+ GenericWorker.prototype.cleanUp.call(this);
+ this._pako = null;
+};
+
+/**
+ * Create the _pako object.
+ * TODO: lazy-loading this object isn't the best solution but it's the
+ * quickest. The best solution is to lazy-load the worker list. See also the
+ * issue #446.
+ */
+FlateWorker.prototype._createPako = function () {
+ this._pako = new pako[this._pakoAction]({
+ raw: true,
+ level: this._pakoOptions.level || -1 // default compression
+ });
+ var self = this;
+ // Forward pako output downstream, preserving the latest chunk metadata
+ this._pako.onData = function(data) {
+ self.push({
+ data : data,
+ meta : self.meta
+ });
+ };
+};
+
+exports.compressWorker = function (compressionOptions) {
+ return new FlateWorker("Deflate", compressionOptions);
+};
+exports.uncompressWorker = function () {
+ return new FlateWorker("Inflate", {});
+};
+
+},{"./stream/GenericWorker":28,"./utils":32,"pako":38}],8:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var GenericWorker = require("../stream/GenericWorker");
+var utf8 = require("../utf8");
+var crc32 = require("../crc32");
+var signature = require("../signature");
+
+/**
+ * Transform an integer into a string in hexadecimal.
+ * @private
+ * @param {number} dec the number to convert.
+ * @param {number} bytes the number of bytes to generate.
+ * @returns {string} the result.
+ */
+var decToHex = function(dec, bytes) {
+ var hex = "", i;
+ for (i = 0; i < bytes; i++) {
+ hex += String.fromCharCode(dec & 0xff);
+ dec = dec >>> 8;
+ }
+ return hex;
+};
+
+/**
+ * Generate the UNIX part of the external file attributes.
+ * @param {Object} unixPermissions the unix permissions or null.
+ * @param {Boolean} isDir true if the entry is a directory, false otherwise.
+ * @return {Number} a 32 bit integer.
+ *
+ * adapted from http://unix.stackexchange.com/questions/14705/the-zip-formats-external-file-attribute :
+ *
+ * TTTTsstrwxrwxrwx0000000000ADVSHR
+ * ^^^^____________________________ file type, see zipinfo.c (UNX_*)
+ * ^^^_________________________ setuid, setgid, sticky
+ * ^^^^^^^^^________________ permissions
+ * ^^^^^^^^^^______ not used ?
+ * ^^^^^^ DOS attribute bits : Archive, Directory, Volume label, System file, Hidden, Read only
+ */
+var generateUnixExternalFileAttr = function (unixPermissions, isDir) {
+
+ var result = unixPermissions;
+ if (!unixPermissions) {
+ // I can't use octal values in strict mode, hence the hexa.
+ // 040775 => 0x41fd
+ // 0100664 => 0x81b4
+ result = isDir ? 0x41fd : 0x81b4;
+ }
+ return (result & 0xFFFF) << 16;
+};
+
+/**
+ * Generate the DOS part of the external file attributes.
+ * @param {Object} dosPermissions the dos permissions or null.
+ * @param {Boolean} isDir true if the entry is a directory, false otherwise.
+ * @return {Number} a 32 bit integer.
+ *
+ * Bit 0 Read-Only
+ * Bit 1 Hidden
+ * Bit 2 System
+ * Bit 3 Volume Label
+ * Bit 4 Directory
+ * Bit 5 Archive
+ */
+var generateDosExternalFileAttr = function (dosPermissions) {
+ // the dir flag is already set for compatibility
+ return (dosPermissions || 0) & 0x3F;
+};
+
+/**
+ * Generate the various parts used in the construction of the final zip file.
+ * @param {Object} streamInfo the hash with information about the compressed file.
+ * @param {Boolean} streamedContent is the content streamed ?
+ * @param {Boolean} streamingEnded is the stream finished ?
+ * @param {number} offset the current offset from the start of the zip file.
+ * @param {String} platform let's pretend we are this platform (change platform dependents fields)
+ * @param {Function} encodeFileName the function to encode the file name / comment.
+ * @return {Object} the zip parts.
+ */
+var generateZipParts = function(streamInfo, streamedContent, streamingEnded, offset, platform, encodeFileName) {
+ var file = streamInfo["file"],
+ compression = streamInfo["compression"],
+ useCustomEncoding = encodeFileName !== utf8.utf8encode,
+ encodedFileName = utils.transformTo("string", encodeFileName(file.name)),
+ utfEncodedFileName = utils.transformTo("string", utf8.utf8encode(file.name)),
+ comment = file.comment,
+ encodedComment = utils.transformTo("string", encodeFileName(comment)),
+ utfEncodedComment = utils.transformTo("string", utf8.utf8encode(comment)),
+ useUTF8ForFileName = utfEncodedFileName.length !== file.name.length,
+ useUTF8ForComment = utfEncodedComment.length !== comment.length,
+ dosTime,
+ dosDate,
+ extraFields = "",
+ unicodePathExtraField = "",
+ unicodeCommentExtraField = "",
+ dir = file.dir,
+ date = file.date;
+
+
+ var dataInfo = {
+ crc32 : 0,
+ compressedSize : 0,
+ uncompressedSize : 0
+ };
+
+ // if the content is streamed, the sizes/crc32 are only available AFTER
+ // the end of the stream.
+ if (!streamedContent || streamingEnded) {
+ dataInfo.crc32 = streamInfo["crc32"];
+ dataInfo.compressedSize = streamInfo["compressedSize"];
+ dataInfo.uncompressedSize = streamInfo["uncompressedSize"];
+ }
+
+ var bitflag = 0;
+ if (streamedContent) {
+ // Bit 3: the sizes/crc32 are set to zero in the local header.
+ // The correct values are put in the data descriptor immediately
+ // following the compressed data.
+ bitflag |= 0x0008;
+ }
+ if (!useCustomEncoding && (useUTF8ForFileName || useUTF8ForComment)) {
+ // Bit 11: Language encoding flag (EFS).
+ bitflag |= 0x0800;
+ }
+
+
+ var extFileAttr = 0;
+ var versionMadeBy = 0;
+ if (dir) {
+ // dos or unix, we set the dos dir flag
+ extFileAttr |= 0x00010;
+ }
+ if(platform === "UNIX") {
+ versionMadeBy = 0x031E; // UNIX, version 3.0
+ extFileAttr |= generateUnixExternalFileAttr(file.unixPermissions, dir);
+ } else { // DOS or other, fallback to DOS
+ versionMadeBy = 0x0014; // DOS, version 2.0
+ extFileAttr |= generateDosExternalFileAttr(file.dosPermissions, dir);
+ }
+
+ // date
+ // @see http://www.delorie.com/djgpp/doc/rbinter/it/52/13.html
+ // @see http://www.delorie.com/djgpp/doc/rbinter/it/65/16.html
+ // @see http://www.delorie.com/djgpp/doc/rbinter/it/66/16.html
+
+ dosTime = date.getUTCHours();
+ dosTime = dosTime << 6;
+ dosTime = dosTime | date.getUTCMinutes();
+ dosTime = dosTime << 5;
+ dosTime = dosTime | date.getUTCSeconds() / 2;
+
+ dosDate = date.getUTCFullYear() - 1980;
+ dosDate = dosDate << 4;
+ dosDate = dosDate | (date.getUTCMonth() + 1);
+ dosDate = dosDate << 5;
+ dosDate = dosDate | date.getUTCDate();
+
+ if (useUTF8ForFileName) {
+ // set the unicode path extra field. unzip needs at least one extra
+ // field to correctly handle unicode path, so using the path is as good
+ // as any other information. This could improve the situation with
+ // other archive managers too.
+ // This field is usually used without the utf8 flag, with a non
+ // unicode path in the header (winrar, winzip). This helps (a bit)
+ // with the messy Windows' default compressed folders feature but
+ // breaks on p7zip which doesn't seek the unicode path extra field.
+ // So for now, UTF-8 everywhere !
+ unicodePathExtraField =
+ // Version
+ decToHex(1, 1) +
+ // NameCRC32
+ decToHex(crc32(encodedFileName), 4) +
+ // UnicodeName
+ utfEncodedFileName;
+
+ extraFields +=
+ // Info-ZIP Unicode Path Extra Field
+ "\x75\x70" +
+ // size
+ decToHex(unicodePathExtraField.length, 2) +
+ // content
+ unicodePathExtraField;
+ }
+
+ if(useUTF8ForComment) {
+
+ unicodeCommentExtraField =
+ // Version
+ decToHex(1, 1) +
+ // CommentCRC32
+ decToHex(crc32(encodedComment), 4) +
+ // UnicodeName
+ utfEncodedComment;
+
+ extraFields +=
+ // Info-ZIP Unicode Path Extra Field
+ "\x75\x63" +
+ // size
+ decToHex(unicodeCommentExtraField.length, 2) +
+ // content
+ unicodeCommentExtraField;
+ }
+
+ var header = "";
+
+ // version needed to extract
+ header += "\x0A\x00";
+ // general purpose bit flag
+ header += decToHex(bitflag, 2);
+ // compression method
+ header += compression.magic;
+ // last mod file time
+ header += decToHex(dosTime, 2);
+ // last mod file date
+ header += decToHex(dosDate, 2);
+ // crc-32
+ header += decToHex(dataInfo.crc32, 4);
+ // compressed size
+ header += decToHex(dataInfo.compressedSize, 4);
+ // uncompressed size
+ header += decToHex(dataInfo.uncompressedSize, 4);
+ // file name length
+ header += decToHex(encodedFileName.length, 2);
+ // extra field length
+ header += decToHex(extraFields.length, 2);
+
+
+ var fileRecord = signature.LOCAL_FILE_HEADER + header + encodedFileName + extraFields;
+
+ var dirRecord = signature.CENTRAL_FILE_HEADER +
+ // version made by (00: DOS)
+ decToHex(versionMadeBy, 2) +
+ // file header (common to file and central directory)
+ header +
+ // file comment length
+ decToHex(encodedComment.length, 2) +
+ // disk number start
+ "\x00\x00" +
+ // internal file attributes TODO
+ "\x00\x00" +
+ // external file attributes
+ decToHex(extFileAttr, 4) +
+ // relative offset of local header
+ decToHex(offset, 4) +
+ // file name
+ encodedFileName +
+ // extra field
+ extraFields +
+ // file comment
+ encodedComment;
+
+ return {
+ fileRecord: fileRecord,
+ dirRecord: dirRecord
+ };
+};
+
+/**
+ * Generate the EOCD record.
+ * @param {Number} entriesCount the number of entries in the zip file.
+ * @param {Number} centralDirLength the length (in bytes) of the central dir.
+ * @param {Number} localDirLength the length (in bytes) of the local dir.
+ * @param {String} comment the zip file comment as a binary string.
+ * @param {Function} encodeFileName the function to encode the comment.
+ * @return {String} the EOCD record.
+ */
+var generateCentralDirectoryEnd = function (entriesCount, centralDirLength, localDirLength, comment, encodeFileName) {
+ var dirEnd = "";
+ var encodedComment = utils.transformTo("string", encodeFileName(comment));
+
+ // end of central dir signature
+ dirEnd = signature.CENTRAL_DIRECTORY_END +
+ // number of this disk
+ "\x00\x00" +
+ // number of the disk with the start of the central directory
+ "\x00\x00" +
+ // total number of entries in the central directory on this disk
+ decToHex(entriesCount, 2) +
+ // total number of entries in the central directory
+ decToHex(entriesCount, 2) +
+ // size of the central directory 4 bytes
+ decToHex(centralDirLength, 4) +
+ // offset of start of central directory with respect to the starting disk number
+ decToHex(localDirLength, 4) +
+ // .ZIP file comment length
+ decToHex(encodedComment.length, 2) +
+ // .ZIP file comment
+ encodedComment;
+
+ return dirEnd;
+};
+
+/**
+ * Generate data descriptors for a file entry.
+ * @param {Object} streamInfo the hash generated by a worker, containing information
+ * on the file entry.
+ * @return {String} the data descriptors.
+ */
+var generateDataDescriptors = function (streamInfo) {
+ var descriptor = "";
+ descriptor = signature.DATA_DESCRIPTOR +
+ // crc-32 4 bytes
+ decToHex(streamInfo["crc32"], 4) +
+ // compressed size 4 bytes
+ decToHex(streamInfo["compressedSize"], 4) +
+ // uncompressed size 4 bytes
+ decToHex(streamInfo["uncompressedSize"], 4);
+
+ return descriptor;
+};
+
+
+/**
+ * A worker to concatenate other workers to create a zip file.
+ * @param {Boolean} streamFiles `true` to stream the content of the files,
+ * `false` to accumulate it.
+ * @param {String} comment the comment to use.
+ * @param {String} platform the platform to use, "UNIX" or "DOS".
+ * @param {Function} encodeFileName the function to encode file names and comments.
+ */
+function ZipFileWorker(streamFiles, comment, platform, encodeFileName) {
+ GenericWorker.call(this, "ZipFileWorker");
+ // The number of bytes written so far. This doesn't count accumulated chunks.
+ this.bytesWritten = 0;
+ // The comment of the zip file
+ this.zipComment = comment;
+ // The platform "generating" the zip file.
+ this.zipPlatform = platform;
+ // the function to encode file names and comments.
+ this.encodeFileName = encodeFileName;
+ // Should we stream the content of the files ?
+ this.streamFiles = streamFiles;
+ // If `streamFiles` is false, we will need to accumulate the content of the
+ // files to calculate sizes / crc32 (and write them *before* the content).
+ // This boolean indicates if we are accumulating chunks (it will change a lot
+ // during the lifetime of this worker).
+ this.accumulate = false;
+ // The buffer receiving chunks when accumulating content.
+ this.contentBuffer = [];
+ // The list of generated directory records.
+ this.dirRecords = [];
+ // The offset (in bytes) from the beginning of the zip file for the current source.
+ this.currentSourceOffset = 0;
+ // The total number of entries in this zip file.
+ this.entriesCount = 0;
+ // the name of the file currently being added, null when handling the end of the zip file.
+ // Used for the emitted metadata.
+ this.currentFile = null;
+
+
+
+ this._sources = [];
+}
+utils.inherits(ZipFileWorker, GenericWorker);
+
+/**
+ * @see GenericWorker.push
+ */
+ZipFileWorker.prototype.push = function (chunk) {
+
+ var currentFilePercent = chunk.meta.percent || 0;
+ var entriesCount = this.entriesCount;
+ var remainingFiles = this._sources.length;
+
+ if(this.accumulate) {
+ this.contentBuffer.push(chunk);
+ } else {
+ this.bytesWritten += chunk.data.length;
+
+ GenericWorker.prototype.push.call(this, {
+ data : chunk.data,
+ meta : {
+ currentFile : this.currentFile,
+ percent : entriesCount ? (currentFilePercent + 100 * (entriesCount - remainingFiles - 1)) / entriesCount : 100
+ }
+ });
+ }
+};
+
+/**
+ * The worker started a new source (an other worker).
+ * @param {Object} streamInfo the streamInfo object from the new source.
+ */
+ZipFileWorker.prototype.openedSource = function (streamInfo) {
+ this.currentSourceOffset = this.bytesWritten;
+ this.currentFile = streamInfo["file"].name;
+
+ var streamedContent = this.streamFiles && !streamInfo["file"].dir;
+
+ // don't stream folders (because they don't have any content)
+ if(streamedContent) {
+ var record = generateZipParts(streamInfo, streamedContent, false, this.currentSourceOffset, this.zipPlatform, this.encodeFileName);
+ this.push({
+ data : record.fileRecord,
+ meta : {percent:0}
+ });
+ } else {
+ // we need to wait for the whole file before pushing anything
+ this.accumulate = true;
+ }
+};
+
+/**
+ * The worker finished a source (an other worker).
+ * @param {Object} streamInfo the streamInfo object from the finished source.
+ */
+ZipFileWorker.prototype.closedSource = function (streamInfo) {
+ this.accumulate = false;
+ var streamedContent = this.streamFiles && !streamInfo["file"].dir;
+ var record = generateZipParts(streamInfo, streamedContent, true, this.currentSourceOffset, this.zipPlatform, this.encodeFileName);
+
+ this.dirRecords.push(record.dirRecord);
+ if(streamedContent) {
+ // after the streamed file, we put data descriptors
+ this.push({
+ data : generateDataDescriptors(streamInfo),
+ meta : {percent:100}
+ });
+ } else {
+ // the content wasn't streamed, we need to push everything now
+ // first the file record, then the content
+ this.push({
+ data : record.fileRecord,
+ meta : {percent:0}
+ });
+ while(this.contentBuffer.length) {
+ this.push(this.contentBuffer.shift());
+ }
+ }
+ this.currentFile = null;
+};
+
+/**
+ * @see GenericWorker.flush
+ */
+ZipFileWorker.prototype.flush = function () {
+
+ var localDirLength = this.bytesWritten;
+ for(var i = 0; i < this.dirRecords.length; i++) {
+ this.push({
+ data : this.dirRecords[i],
+ meta : {percent:100}
+ });
+ }
+ var centralDirLength = this.bytesWritten - localDirLength;
+
+ var dirEnd = generateCentralDirectoryEnd(this.dirRecords.length, centralDirLength, localDirLength, this.zipComment, this.encodeFileName);
+
+ this.push({
+ data : dirEnd,
+ meta : {percent:100}
+ });
+};
+
+/**
+ * Prepare the next source to be read.
+ */
+ZipFileWorker.prototype.prepareNextSource = function () {
+ this.previous = this._sources.shift();
+ this.openedSource(this.previous.streamInfo);
+ if (this.isPaused) {
+ this.previous.pause();
+ } else {
+ this.previous.resume();
+ }
+};
+
+/**
+ * @see GenericWorker.registerPrevious
+ */
+ZipFileWorker.prototype.registerPrevious = function (previous) {
+ this._sources.push(previous);
+ var self = this;
+
+ previous.on("data", function (chunk) {
+ self.processChunk(chunk);
+ });
+ previous.on("end", function () {
+ self.closedSource(self.previous.streamInfo);
+ if(self._sources.length) {
+ self.prepareNextSource();
+ } else {
+ self.end();
+ }
+ });
+ previous.on("error", function (e) {
+ self.error(e);
+ });
+ return this;
+};
+
+/**
+ * @see GenericWorker.resume
+ */
+ZipFileWorker.prototype.resume = function () {
+ if(!GenericWorker.prototype.resume.call(this)) {
+ return false;
+ }
+
+ if (!this.previous && this._sources.length) {
+ this.prepareNextSource();
+ return true;
+ }
+ if (!this.previous && !this._sources.length && !this.generatedError) {
+ this.end();
+ return true;
+ }
+};
+
+/**
+ * @see GenericWorker.error
+ */
+ZipFileWorker.prototype.error = function (e) {
+ var sources = this._sources;
+ if(!GenericWorker.prototype.error.call(this, e)) {
+ return false;
+ }
+ for(var i = 0; i < sources.length; i++) {
+ try {
+ sources[i].error(e);
+ } catch(e) {
+ // the `error` exploded, nothing to do
+ }
+ }
+ return true;
+};
+
+/**
+ * @see GenericWorker.lock
+ */
+ZipFileWorker.prototype.lock = function () {
+ GenericWorker.prototype.lock.call(this);
+ var sources = this._sources;
+ for(var i = 0; i < sources.length; i++) {
+ sources[i].lock();
+ }
+};
+
+module.exports = ZipFileWorker;
+
+},{"../crc32":4,"../signature":23,"../stream/GenericWorker":28,"../utf8":31,"../utils":32}],9:[function(require,module,exports){
+"use strict";
+
+var compressions = require("../compressions");
+var ZipFileWorker = require("./ZipFileWorker");
+
+/**
+ * Find the compression to use.
+ * @param {String} fileCompression the compression defined at the file level, if any.
+ * @param {String} zipCompression the compression defined at the load() level.
+ * @return {Object} the compression object to use.
+ */
+var getCompression = function (fileCompression, zipCompression) {
+
+ var compressionName = fileCompression || zipCompression;
+ var compression = compressions[compressionName];
+ if (!compression) {
+ throw new Error(compressionName + " is not a valid compression method !");
+ }
+ return compression;
+};
+
+/**
+ * Create a worker to generate a zip file.
+ * @param {JSZip} zip the JSZip instance at the right root level.
+ * @param {Object} options to generate the zip file.
+ * @param {String} comment the comment to use.
+ */
+exports.generateWorker = function (zip, options, comment) {
+
+ var zipFileWorker = new ZipFileWorker(options.streamFiles, comment, options.platform, options.encodeFileName);
+ var entriesCount = 0;
+ try {
+
+ zip.forEach(function (relativePath, file) {
+ entriesCount++;
+ var compression = getCompression(file.options.compression, options.compression);
+ var compressionOptions = file.options.compressionOptions || options.compressionOptions || {};
+ var dir = file.dir, date = file.date;
+
+ file._compressWorker(compression, compressionOptions)
+ .withStreamInfo("file", {
+ name : relativePath,
+ dir : dir,
+ date : date,
+ comment : file.comment || "",
+ unixPermissions : file.unixPermissions,
+ dosPermissions : file.dosPermissions
+ })
+ .pipe(zipFileWorker);
+ });
+ zipFileWorker.entriesCount = entriesCount;
+ } catch (e) {
+ zipFileWorker.error(e);
+ }
+
+ return zipFileWorker;
+};
+
+},{"../compressions":3,"./ZipFileWorker":8}],10:[function(require,module,exports){
+"use strict";
+
+/**
+ * Representation a of zip file in js
+ * @constructor
+ */
+function JSZip() {
+ // if this constructor is used without `new`, it adds `new` before itself:
+ if(!(this instanceof JSZip)) {
+ return new JSZip();
+ }
+
+ if(arguments.length) {
+ throw new Error("The constructor with parameters has been removed in JSZip 3.0, please check the upgrade guide.");
+ }
+
+ // object containing the files :
+ // {
+ // "folder/" : {...},
+ // "folder/data.txt" : {...}
+ // }
+ // NOTE: we use a null prototype because we do not
+ // want filenames like "toString" coming from a zip file
+ // to overwrite methods and attributes in a normal Object.
+ this.files = Object.create(null);
+
+ this.comment = null;
+
+ // Where we are in the hierarchy
+ this.root = "";
+ this.clone = function() {
+ var newObj = new JSZip();
+ for (var i in this) {
+ if (typeof this[i] !== "function") {
+ newObj[i] = this[i];
+ }
+ }
+ return newObj;
+ };
+}
+JSZip.prototype = require("./object");
+JSZip.prototype.loadAsync = require("./load");
+JSZip.support = require("./support");
+JSZip.defaults = require("./defaults");
+
+// TODO find a better way to handle this version,
+// a require('package.json').version doesn't work with webpack, see #327
+JSZip.version = "3.10.1";
+
+JSZip.loadAsync = function (content, options) {
+ return new JSZip().loadAsync(content, options);
+};
+
+JSZip.external = require("./external");
+module.exports = JSZip;
+
+},{"./defaults":5,"./external":6,"./load":11,"./object":15,"./support":30}],11:[function(require,module,exports){
+"use strict";
+var utils = require("./utils");
+var external = require("./external");
+var utf8 = require("./utf8");
+var ZipEntries = require("./zipEntries");
+var Crc32Probe = require("./stream/Crc32Probe");
+var nodejsUtils = require("./nodejsUtils");
+
+/**
+ * Check the CRC32 of an entry.
+ * @param {ZipEntry} zipEntry the zip entry to check.
+ * @return {Promise} the result.
+ */
+function checkEntryCRC32(zipEntry) {
+ return new external.Promise(function (resolve, reject) {
+ var worker = zipEntry.decompressed.getContentWorker().pipe(new Crc32Probe());
+ worker.on("error", function (e) {
+ reject(e);
+ })
+ .on("end", function () {
+ if (worker.streamInfo.crc32 !== zipEntry.decompressed.crc32) {
+ reject(new Error("Corrupted zip : CRC32 mismatch"));
+ } else {
+ resolve();
+ }
+ })
+ .resume();
+ });
+}
+
+module.exports = function (data, options) {
+ var zip = this;
+ options = utils.extend(options || {}, {
+ base64: false,
+ checkCRC32: false,
+ optimizedBinaryString: false,
+ createFolders: false,
+ decodeFileName: utf8.utf8decode
+ });
+
+ if (nodejsUtils.isNode && nodejsUtils.isStream(data)) {
+ return external.Promise.reject(new Error("JSZip can't accept a stream when loading a zip file."));
+ }
+
+ return utils.prepareContent("the loaded zip file", data, true, options.optimizedBinaryString, options.base64)
+ .then(function (data) {
+ var zipEntries = new ZipEntries(options);
+ zipEntries.load(data);
+ return zipEntries;
+ }).then(function checkCRC32(zipEntries) {
+ var promises = [external.Promise.resolve(zipEntries)];
+ var files = zipEntries.files;
+ if (options.checkCRC32) {
+ for (var i = 0; i < files.length; i++) {
+ promises.push(checkEntryCRC32(files[i]));
+ }
+ }
+ return external.Promise.all(promises);
+ }).then(function addFiles(results) {
+ var zipEntries = results.shift();
+ var files = zipEntries.files;
+ for (var i = 0; i < files.length; i++) {
+ var input = files[i];
+
+ var unsafeName = input.fileNameStr;
+ var safeName = utils.resolve(input.fileNameStr);
+
+ zip.file(safeName, input.decompressed, {
+ binary: true,
+ optimizedBinaryString: true,
+ date: input.date,
+ dir: input.dir,
+ comment: input.fileCommentStr.length ? input.fileCommentStr : null,
+ unixPermissions: input.unixPermissions,
+ dosPermissions: input.dosPermissions,
+ createFolders: options.createFolders
+ });
+ if (!input.dir) {
+ zip.file(safeName).unsafeOriginalName = unsafeName;
+ }
+ }
+ if (zipEntries.zipComment.length) {
+ zip.comment = zipEntries.zipComment;
+ }
+
+ return zip;
+ });
+};
+
+},{"./external":6,"./nodejsUtils":14,"./stream/Crc32Probe":25,"./utf8":31,"./utils":32,"./zipEntries":33}],12:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var GenericWorker = require("../stream/GenericWorker");
+
+/**
+ * A worker that use a nodejs stream as source.
+ * @constructor
+ * @param {String} filename the name of the file entry for this stream.
+ * @param {Readable} stream the nodejs stream.
+ */
+function NodejsStreamInputAdapter(filename, stream) {
+ GenericWorker.call(this, "Nodejs stream input adapter for " + filename);
+ this._upstreamEnded = false;
+ this._bindStream(stream);
+}
+
+utils.inherits(NodejsStreamInputAdapter, GenericWorker);
+
+/**
+ * Prepare the stream and bind the callbacks on it.
+ * Do this ASAP on node 0.10 ! A lazy binding doesn't always work.
+ * @param {Stream} stream the nodejs stream to use.
+ */
+NodejsStreamInputAdapter.prototype._bindStream = function (stream) {
+ var self = this;
+ this._stream = stream;
+ stream.pause();
+ stream
+ .on("data", function (chunk) {
+ self.push({
+ data: chunk,
+ meta : {
+ percent : 0
+ }
+ });
+ })
+ .on("error", function (e) {
+ if(self.isPaused) {
+ this.generatedError = e;
+ } else {
+ self.error(e);
+ }
+ })
+ .on("end", function () {
+ if(self.isPaused) {
+ self._upstreamEnded = true;
+ } else {
+ self.end();
+ }
+ });
+};
+NodejsStreamInputAdapter.prototype.pause = function () {
+ if(!GenericWorker.prototype.pause.call(this)) {
+ return false;
+ }
+ this._stream.pause();
+ return true;
+};
+NodejsStreamInputAdapter.prototype.resume = function () {
+ if(!GenericWorker.prototype.resume.call(this)) {
+ return false;
+ }
+
+ if(this._upstreamEnded) {
+ this.end();
+ } else {
+ this._stream.resume();
+ }
+
+ return true;
+};
+
+module.exports = NodejsStreamInputAdapter;
+
+},{"../stream/GenericWorker":28,"../utils":32}],13:[function(require,module,exports){
+"use strict";
+
+var Readable = require("readable-stream").Readable;
+
+var utils = require("../utils");
+utils.inherits(NodejsStreamOutputAdapter, Readable);
+
+/**
+* A nodejs stream using a worker as source.
+* @see the SourceWrapper in http://nodejs.org/api/stream.html
+* @constructor
+* @param {StreamHelper} helper the helper wrapping the worker
+* @param {Object} options the nodejs stream options
+* @param {Function} updateCb the update callback.
+*/
+function NodejsStreamOutputAdapter(helper, options, updateCb) {
+ Readable.call(this, options);
+ this._helper = helper;
+
+ var self = this;
+ helper.on("data", function (data, meta) {
+ if (!self.push(data)) {
+ self._helper.pause();
+ }
+ if(updateCb) {
+ updateCb(meta);
+ }
+ })
+ .on("error", function(e) {
+ self.emit("error", e);
+ })
+ .on("end", function () {
+ self.push(null);
+ });
+}
+
+
+NodejsStreamOutputAdapter.prototype._read = function() {
+ this._helper.resume();
+};
+
+module.exports = NodejsStreamOutputAdapter;
+
+},{"../utils":32,"readable-stream":16}],14:[function(require,module,exports){
+"use strict";
+
+module.exports = {
+ /**
+ * True if this is running in Nodejs, will be undefined in a browser.
+ * In a browser, browserify won't include this file and the whole module
+ * will be resolved an empty object.
+ */
+ isNode : typeof Buffer !== "undefined",
+ /**
+ * Create a new nodejs Buffer from an existing content.
+ * @param {Object} data the data to pass to the constructor.
+ * @param {String} encoding the encoding to use.
+ * @return {Buffer} a new Buffer.
+ */
+ newBufferFrom: function(data, encoding) {
+ if (Buffer.from && Buffer.from !== Uint8Array.from) {
+ return Buffer.from(data, encoding);
+ } else {
+ if (typeof data === "number") {
+ // Safeguard for old Node.js versions. On newer versions,
+ // Buffer.from(number) / Buffer(number, encoding) already throw.
+ throw new Error("The \"data\" argument must not be a number");
+ }
+ return new Buffer(data, encoding);
+ }
+ },
+ /**
+ * Create a new nodejs Buffer with the specified size.
+ * @param {Integer} size the size of the buffer.
+ * @return {Buffer} a new Buffer.
+ */
+ allocBuffer: function (size) {
+ if (Buffer.alloc) {
+ return Buffer.alloc(size);
+ } else {
+ var buf = new Buffer(size);
+ buf.fill(0);
+ return buf;
+ }
+ },
+ /**
+ * Find out if an object is a Buffer.
+ * @param {Object} b the object to test.
+ * @return {Boolean} true if the object is a Buffer, false otherwise.
+ */
+ isBuffer : function(b){
+ return Buffer.isBuffer(b);
+ },
+
+ isStream : function (obj) {
+ return obj &&
+ typeof obj.on === "function" &&
+ typeof obj.pause === "function" &&
+ typeof obj.resume === "function";
+ }
+};
+
+},{}],15:[function(require,module,exports){
+"use strict";
+var utf8 = require("./utf8");
+var utils = require("./utils");
+var GenericWorker = require("./stream/GenericWorker");
+var StreamHelper = require("./stream/StreamHelper");
+var defaults = require("./defaults");
+var CompressedObject = require("./compressedObject");
+var ZipObject = require("./zipObject");
+var generate = require("./generate");
+var nodejsUtils = require("./nodejsUtils");
+var NodejsStreamInputAdapter = require("./nodejs/NodejsStreamInputAdapter");
+
+
+/**
+ * Add a file in the current folder.
+ * @private
+ * @param {string} name the name of the file
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the data of the file
+ * @param {Object} originalOptions the options of the file
+ * @return {Object} the new file.
+ */
+var fileAdd = function(name, data, originalOptions) {
+ // be sure sub folders exist
+ var dataType = utils.getTypeOf(data),
+ parent;
+
+
+ /*
+ * Correct options.
+ */
+
+ var o = utils.extend(originalOptions || {}, defaults);
+ o.date = o.date || new Date();
+ if (o.compression !== null) {
+ o.compression = o.compression.toUpperCase();
+ }
+
+ if (typeof o.unixPermissions === "string") {
+ o.unixPermissions = parseInt(o.unixPermissions, 8);
+ }
+
+ // UNX_IFDIR 0040000 see zipinfo.c
+ if (o.unixPermissions && (o.unixPermissions & 0x4000)) {
+ o.dir = true;
+ }
+ // Bit 4 Directory
+ if (o.dosPermissions && (o.dosPermissions & 0x0010)) {
+ o.dir = true;
+ }
+
+ if (o.dir) {
+ name = forceTrailingSlash(name);
+ }
+ if (o.createFolders && (parent = parentFolder(name))) {
+ folderAdd.call(this, parent, true);
+ }
+
+ var isUnicodeString = dataType === "string" && o.binary === false && o.base64 === false;
+ if (!originalOptions || typeof originalOptions.binary === "undefined") {
+ o.binary = !isUnicodeString;
+ }
+
+
+ var isCompressedEmpty = (data instanceof CompressedObject) && data.uncompressedSize === 0;
+
+ if (isCompressedEmpty || o.dir || !data || data.length === 0) {
+ o.base64 = false;
+ o.binary = true;
+ data = "";
+ o.compression = "STORE";
+ dataType = "string";
+ }
+
+ /*
+ * Convert content to fit.
+ */
+
+ var zipObjectContent = null;
+ if (data instanceof CompressedObject || data instanceof GenericWorker) {
+ zipObjectContent = data;
+ } else if (nodejsUtils.isNode && nodejsUtils.isStream(data)) {
+ zipObjectContent = new NodejsStreamInputAdapter(name, data);
+ } else {
+ zipObjectContent = utils.prepareContent(name, data, o.binary, o.optimizedBinaryString, o.base64);
+ }
+
+ var object = new ZipObject(name, zipObjectContent, o);
+ this.files[name] = object;
+ /*
+ TODO: we can't throw an exception because we have async promises
+ (we can have a promise of a Date() for example) but returning a
+ promise is useless because file(name, data) returns the JSZip
+ object for chaining. Should we break that to allow the user
+ to catch the error ?
+
+ return external.Promise.resolve(zipObjectContent)
+ .then(function () {
+ return object;
+ });
+ */
+};
+
+/**
+ * Find the parent folder of the path.
+ * @private
+ * @param {string} path the path to use
+ * @return {string} the parent folder, or ""
+ */
var parentFolder = function (path) {
    // Drop a trailing slash first so "a/b/" resolves to "a", not "a/b".
    var trimmed = path.slice(-1) === "/" ? path.substring(0, path.length - 1) : path;
    var cut = trimmed.lastIndexOf("/");
    // A slash at position 0 (or none at all) means there is no parent folder.
    if (cut <= 0) {
        return "";
    }
    return trimmed.substring(0, cut);
};
+
+/**
+ * Returns the path with a slash at the end.
+ * @private
+ * @param {String} path the path to check.
+ * @return {String} the path with a trailing slash.
+ */
var forceTrailingSlash = function(path) {
    // Append a "/" only when one is not already present.
    // slice(-1) is used instead of substr(-1), which IE mishandles.
    return path.slice(-1) === "/" ? path : path + "/";
};
+
+/**
+ * Add a (sub) folder in the current folder.
+ * @private
+ * @param {string} name the folder's name
+ * @param {boolean=} [createFolders] If true, automatically create sub
+ * folders. Defaults to false.
+ * @return {Object} the new folder.
+ */
var folderAdd = function(name, createFolders) {
    // Fall back to the library default when the caller did not decide.
    if (typeof createFolders === "undefined") {
        createFolders = defaults.createFolders;
    }

    // Folder entries always carry a trailing slash.
    var folderName = forceTrailingSlash(name);

    // Create the entry only if it does not exist yet; either way the
    // (new or pre-existing) folder object is returned.
    if (!this.files[folderName]) {
        fileAdd.call(this, folderName, null, {
            dir: true,
            createFolders: createFolders
        });
    }
    return this.files[folderName];
};
+
+/**
+* Cross-window, cross-Node-context regular expression detection
+* @param {Object} object Anything
+* @return {Boolean} true if the object is a regular expression,
+* false otherwise
+*/
function isRegExp(object) {
    // toString-tag check survives regexes created in another window /
    // Node context, unlike `instanceof RegExp`.
    var tag = Object.prototype.toString.call(object);
    return tag === "[object RegExp]";
}
+
+// return the actual prototype of JSZip
var out = {
    /**
     * @see loadAsync
     * @deprecated removed in JSZip 3.0; kept only to raise a helpful error.
     */
    load: function() {
        throw new Error("This method has been removed in JSZip 3.0, please check the upgrade guide.");
    },


    /**
     * Call a callback function for each entry at this folder level.
     * @param {Function} cb the callback function:
     * function (relativePath, file) {...}
     * It takes 2 arguments : the relative path and the file.
     */
    forEach: function(cb) {
        var filename, relativePath, file;
        // ignore warning about unwanted properties because this.files is a null prototype object
        /* eslint-disable-next-line guard-for-in */
        for (filename in this.files) {
            file = this.files[filename];
            // entries are stored under absolute names; strip the current
            // root so the callback sees a root-relative path
            relativePath = filename.slice(this.root.length, filename.length);
            if (relativePath && filename.slice(0, this.root.length) === this.root) { // the file is in the current root
                cb(relativePath, file); // TODO reverse the parameters ? need to be clean AND consistent with the filter search fn...
            }
        }
    },

    /**
     * Filter nested files/folders with the specified function.
     * @param {Function} search the predicate to use :
     * function (relativePath, file) {...}
     * It takes 2 arguments : the relative path and the file.
     * @return {Array} An array of matching elements.
     */
    filter: function(search) {
        var result = [];
        this.forEach(function (relativePath, entry) {
            if (search(relativePath, entry)) { // the file matches the function
                result.push(entry);
            }

        });
        return result;
    },

    /**
     * Add a file to the zip file, or search a file.
     * @param {string|RegExp} name The name of the file to add (if data is defined),
     * the name of the file to find (if no data) or a regex to match files.
     * @param {String|ArrayBuffer|Uint8Array|Buffer} data The file data, either raw or base64 encoded
     * @param {Object} o File options
     * @return {JSZip|Object|Array} this JSZip object (when adding a file),
     * a file (when searching by string) or an array of files (when searching by regex).
     */
    file: function(name, data, o) {
        // one argument -> search mode; more -> add mode
        if (arguments.length === 1) {
            if (isRegExp(name)) {
                var regexp = name;
                return this.filter(function(relativePath, file) {
                    // regex search matches files only, never folder entries
                    return !file.dir && regexp.test(relativePath);
                });
            }
            else { // text
                // exact lookup, relative to the current root
                var obj = this.files[this.root + name];
                if (obj && !obj.dir) {
                    return obj;
                } else {
                    return null;
                }
            }
        }
        else { // more than one argument : we have data !
            name = this.root + name;
            fileAdd.call(this, name, data, o);
        }
        return this;
    },

    /**
     * Add a directory to the zip file, or search.
     * @param {String|RegExp} arg The name of the directory to add, or a regex to search folders.
     * @return {JSZip} an object with the new directory as the root, or an array containing matching folders.
     */
    folder: function(arg) {
        if (!arg) {
            return this;
        }

        // regex search mode: return the matching folder entries
        if (isRegExp(arg)) {
            return this.filter(function(relativePath, file) {
                return file.dir && arg.test(relativePath);
            });
        }

        // else, name is a new folder
        var name = this.root + arg;
        var newFolder = folderAdd.call(this, name);

        // Allow chaining by returning a new object with this folder as the root
        var ret = this.clone();
        ret.root = newFolder.name;
        return ret;
    },

    /**
     * Delete a file, or a directory and all sub-files, from the zip
     * @param {string} name the name of the file to delete
     * @return {JSZip} this JSZip object
     */
    remove: function(name) {
        name = this.root + name;
        var file = this.files[name];
        if (!file) {
            // Look for any folders
            if (name.slice(-1) !== "/") {
                name += "/";
            }
            file = this.files[name];
        }

        if (file && !file.dir) {
            // file
            delete this.files[name];
        } else {
            // maybe a folder, delete recursively
            // (the prefix filter also matches the folder entry itself,
            // so it is removed together with its children)
            var kids = this.filter(function(relativePath, file) {
                return file.name.slice(0, name.length) === name;
            });
            for (var i = 0; i < kids.length; i++) {
                delete this.files[kids[i].name];
            }
        }

        return this;
    },

    /**
     * @deprecated This method has been removed in JSZip 3.0, please check the upgrade guide.
     */
    generate: function() {
        throw new Error("This method has been removed in JSZip 3.0, please check the upgrade guide.");
    },

    /**
     * Generate the complete zip file as an internal stream.
     * @param {Object} options the options to generate the zip file :
     * - compression, "STORE" by default.
     * - type, "base64" by default. Values are : string, base64, uint8array, arraybuffer, blob.
     * @return {StreamHelper} the streamed zip file.
     */
    generateInternalStream: function(options) {
        var worker, opts = {};
        try {
            // merge the caller's options with the defaults below
            opts = utils.extend(options || {}, {
                streamFiles: false,
                compression: "STORE",
                compressionOptions : null,
                type: "",
                platform: "DOS",
                comment: null,
                mimeType: "application/zip",
                encodeFileName: utf8.utf8encode
            });

            // normalize casing so user input is case-insensitive
            opts.type = opts.type.toLowerCase();
            opts.compression = opts.compression.toUpperCase();

            // "binarystring" is preferred but the internals use "string".
            if(opts.type === "binarystring") {
                opts.type = "string";
            }

            if (!opts.type) {
                throw new Error("No output type specified.");
            }

            utils.checkSupport(opts.type);

            // accept nodejs `process.platform`
            if(
                opts.platform === "darwin" ||
                opts.platform === "freebsd" ||
                opts.platform === "linux" ||
                opts.platform === "sunos"
            ) {
                opts.platform = "UNIX";
            }
            if (opts.platform === "win32") {
                opts.platform = "DOS";
            }

            var comment = opts.comment || this.comment || "";
            worker = generate.generateWorker(this, opts, comment);
        } catch (e) {
            // never throw: surface setup failures through an errored worker
            // so the caller still gets a StreamHelper to listen on
            worker = new GenericWorker("error");
            worker.error(e);
        }
        return new StreamHelper(worker, opts.type || "string", opts.mimeType);
    },
    /**
     * Generate the complete zip file asynchronously.
     * @see generateInternalStream
     */
    generateAsync: function(options, onUpdate) {
        return this.generateInternalStream(options).accumulate(onUpdate);
    },
    /**
     * Generate the complete zip file asynchronously, as a nodejs stream.
     * Defaults the output type to "nodebuffer" chunks.
     * @see generateInternalStream
     */
    generateNodeStream: function(options, onUpdate) {
        options = options || {};
        if (!options.type) {
            options.type = "nodebuffer";
        }
        return this.generateInternalStream(options).toNodejsStream(onUpdate);
    }
};
module.exports = out;
+
+},{"./compressedObject":2,"./defaults":5,"./generate":9,"./nodejs/NodejsStreamInputAdapter":12,"./nodejsUtils":14,"./stream/GenericWorker":28,"./stream/StreamHelper":29,"./utf8":31,"./utils":32,"./zipObject":35}],16:[function(require,module,exports){
"use strict";
/*
 * This file is used by module bundlers (browserify/webpack/etc) when
 * including a stream implementation. We use "readable-stream" to get a
 * consistent behavior between nodejs versions but bundlers often have a shim
 * for "stream". Using this shim greatly improves the compatibility and
 * greatly reduces the final size of the bundle (only one stream
 * implementation, not two).
 */
module.exports = require("stream");
+
+},{"stream":undefined}],17:[function(require,module,exports){
+"use strict";
+var DataReader = require("./DataReader");
+var utils = require("../utils");
+
function ArrayReader(data) {
    DataReader.call(this, data);
    // Normalize every entry to an unsigned byte so later comparisons
    // against charCodes behave consistently.
    var len = this.data.length;
    for (var i = 0; i < len; i++) {
        data[i] = data[i] & 0xFF;
    }
}
utils.inherits(ArrayReader, DataReader);
/**
 * @see DataReader.byteAt
 */
ArrayReader.prototype.byteAt = function(i) {
    // `zero` is the offset of the logical start inside the raw array.
    return this.data[this.zero + i];
};
/**
 * @see DataReader.lastIndexOfSignature
 */
ArrayReader.prototype.lastIndexOfSignature = function(sig) {
    var b0 = sig.charCodeAt(0),
        b1 = sig.charCodeAt(1),
        b2 = sig.charCodeAt(2),
        b3 = sig.charCodeAt(3);
    // Scan backwards for the last occurrence of the 4-byte signature.
    for (var i = this.length - 4; i >= 0; --i) {
        if (this.data[i] === b0 && this.data[i + 1] === b1 && this.data[i + 2] === b2 && this.data[i + 3] === b3) {
            return i - this.zero;
        }
    }
    return -1;
};
/**
 * @see DataReader.readAndCheckSignature
 */
ArrayReader.prototype.readAndCheckSignature = function (sig) {
    var b0 = sig.charCodeAt(0),
        b1 = sig.charCodeAt(1),
        b2 = sig.charCodeAt(2),
        b3 = sig.charCodeAt(3);
    var actual = this.readData(4);
    return b0 === actual[0] && b1 === actual[1] && b2 === actual[2] && b3 === actual[3];
};
/**
 * @see DataReader.readData
 */
ArrayReader.prototype.readData = function(size) {
    this.checkOffset(size);
    if (size === 0) {
        return [];
    }
    var start = this.zero + this.index;
    var result = this.data.slice(start, start + size);
    this.index += size;
    return result;
};
module.exports = ArrayReader;
+
+},{"../utils":32,"./DataReader":18}],18:[function(require,module,exports){
+"use strict";
+var utils = require("../utils");
+
function DataReader(data) {
    this.data = data; // underlying storage; the concrete type depends on the subclass
    this.length = data.length;
    this.index = 0;   // current read position
    this.zero = 0;    // offset of the logical start inside `data`
}
DataReader.prototype = {
    /**
     * Check that the offset will not go too far.
     * @param {string} offset the additional offset to check.
     * @throws {Error} an Error if the offset is out of bounds.
     */
    checkOffset: function(offset) {
        this.checkIndex(this.index + offset);
    },
    /**
     * Check that the specified index will not be too far.
     * @param {string} newIndex the index to check.
     * @throws {Error} an Error if the index is out of bounds.
     */
    checkIndex: function(newIndex) {
        var outOfBounds = this.length < this.zero + newIndex || newIndex < 0;
        if (outOfBounds) {
            throw new Error("End of data reached (data length = " + this.length + ", asked index = " + (newIndex) + "). Corrupted zip ?");
        }
    },
    /**
     * Change the index.
     * @param {number} newIndex The new index.
     * @throws {Error} if the new index is out of the data.
     */
    setIndex: function(newIndex) {
        this.checkIndex(newIndex);
        this.index = newIndex;
    },
    /**
     * Skip the next n bytes.
     * @param {number} n the number of bytes to skip.
     * @throws {Error} if the new index is out of the data.
     */
    skip: function(n) {
        this.setIndex(this.index + n);
    },
    /**
     * Get the byte at the specified index.
     * @param {number} i the index to use.
     * @return {number} a byte.
     */
    byteAt: function() {
        // abstract, see implementations
    },
    /**
     * Get the next number with a given byte size.
     * @param {number} size the number of bytes to read.
     * @return {number} the corresponding number.
     */
    readInt: function(size) {
        this.checkOffset(size);
        // Values are stored little-endian: walk from the most significant
        // byte (index + size - 1) down to the least significant one.
        var value = 0;
        for (var i = this.index + size - 1; i >= this.index; i--) {
            value = (value << 8) + this.byteAt(i);
        }
        this.index += size;
        return value;
    },
    /**
     * Get the next string with a given byte size.
     * @param {number} size the number of bytes to read.
     * @return {string} the corresponding string.
     */
    readString: function(size) {
        return utils.transformTo("string", this.readData(size));
    },
    /**
     * Get raw data without conversion, bytes.
     * @param {number} size the number of bytes to read.
     * @return {Object} the raw data, implementation specific.
     */
    readData: function() {
        // abstract, see implementations
    },
    /**
     * Find the last occurrence of a zip signature (4 bytes).
     * @param {string} sig the signature to find.
     * @return {number} the index of the last occurrence, -1 if not found.
     */
    lastIndexOfSignature: function() {
        // abstract, see implementations
    },
    /**
     * Read the signature (4 bytes) at the current position and compare it with sig.
     * @param {string} sig the expected signature
     * @return {boolean} true if the signature matches, false otherwise.
     */
    readAndCheckSignature: function() {
        // abstract, see implementations
    },
    /**
     * Get the next date.
     * @return {Date} the date, decoded from 4 bytes of MS-DOS date/time.
     */
    readDate: function() {
        var dostime = this.readInt(4);
        return new Date(Date.UTC(
            ((dostime >> 25) & 0x7f) + 1980, // year
            ((dostime >> 21) & 0x0f) - 1, // month
            (dostime >> 16) & 0x1f, // day
            (dostime >> 11) & 0x1f, // hour
            (dostime >> 5) & 0x3f, // minute
            (dostime & 0x1f) << 1)); // second
    }
};
module.exports = DataReader;
+
+},{"../utils":32}],19:[function(require,module,exports){
+"use strict";
+var Uint8ArrayReader = require("./Uint8ArrayReader");
+var utils = require("../utils");
+
function NodeBufferReader(data) {
    Uint8ArrayReader.call(this, data);
}
utils.inherits(NodeBufferReader, Uint8ArrayReader);

/**
 * @see DataReader.readData
 */
NodeBufferReader.prototype.readData = function(size) {
    this.checkOffset(size);
    var start = this.zero + this.index;
    var chunk = this.data.slice(start, start + size);
    this.index += size;
    return chunk;
};
module.exports = NodeBufferReader;
+
+},{"../utils":32,"./Uint8ArrayReader":21}],20:[function(require,module,exports){
+"use strict";
+var DataReader = require("./DataReader");
+var utils = require("../utils");
+
function StringReader(data) {
    DataReader.call(this, data);
}
utils.inherits(StringReader, DataReader);
/**
 * @see DataReader.byteAt
 */
StringReader.prototype.byteAt = function(i) {
    return this.data.charCodeAt(this.zero + i);
};
/**
 * @see DataReader.lastIndexOfSignature
 */
StringReader.prototype.lastIndexOfSignature = function(sig) {
    // String.prototype.lastIndexOf does the scan; rebase onto `zero`.
    return this.data.lastIndexOf(sig) - this.zero;
};
/**
 * @see DataReader.readAndCheckSignature
 */
StringReader.prototype.readAndCheckSignature = function (sig) {
    // readData returns a substring here, so a direct comparison works.
    return sig === this.readData(4);
};
/**
 * @see DataReader.readData
 */
StringReader.prototype.readData = function(size) {
    this.checkOffset(size);
    var start = this.zero + this.index;
    var result = this.data.slice(start, start + size);
    this.index += size;
    return result;
};
module.exports = StringReader;
+
+},{"../utils":32,"./DataReader":18}],21:[function(require,module,exports){
+"use strict";
+var ArrayReader = require("./ArrayReader");
+var utils = require("../utils");
+
function Uint8ArrayReader(data) {
    ArrayReader.call(this, data);
}
utils.inherits(Uint8ArrayReader, ArrayReader);
/**
 * @see DataReader.readData
 */
Uint8ArrayReader.prototype.readData = function(size) {
    this.checkOffset(size);
    if (size === 0) {
        // workaround: IE10 returns [0x00] from subarray(idx, idx) instead of [].
        return new Uint8Array(0);
    }
    var start = this.zero + this.index;
    var view = this.data.subarray(start, start + size);
    this.index += size;
    return view;
};
module.exports = Uint8ArrayReader;
+
+},{"../utils":32,"./ArrayReader":17}],22:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var support = require("../support");
+var ArrayReader = require("./ArrayReader");
+var StringReader = require("./StringReader");
+var NodeBufferReader = require("./NodeBufferReader");
+var Uint8ArrayReader = require("./Uint8ArrayReader");
+
+/**
+ * Create a reader adapted to the data.
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the data to read.
+ * @return {DataReader} the data reader.
+ */
+module.exports = function (data) {
+ var type = utils.getTypeOf(data);
+ utils.checkSupport(type);
+ if (type === "string" && !support.uint8array) {
+ return new StringReader(data);
+ }
+ if (type === "nodebuffer") {
+ return new NodeBufferReader(data);
+ }
+ if (support.uint8array) {
+ return new Uint8ArrayReader(utils.transformTo("uint8array", data));
+ }
+ return new ArrayReader(utils.transformTo("array", data));
+};
+
+},{"../support":30,"../utils":32,"./ArrayReader":17,"./NodeBufferReader":19,"./StringReader":20,"./Uint8ArrayReader":21}],23:[function(require,module,exports){
+"use strict";
// Magic numbers from the ZIP specification, as binary strings
// ("PK" followed by two bytes). Each record type in an archive
// starts with one of these 4-byte signatures.
exports.LOCAL_FILE_HEADER = "PK\x03\x04";
exports.CENTRAL_FILE_HEADER = "PK\x01\x02";
exports.CENTRAL_DIRECTORY_END = "PK\x05\x06";
exports.ZIP64_CENTRAL_DIRECTORY_LOCATOR = "PK\x06\x07";
exports.ZIP64_CENTRAL_DIRECTORY_END = "PK\x06\x06";
exports.DATA_DESCRIPTOR = "PK\x07\x08";
+
+},{}],24:[function(require,module,exports){
+"use strict";
+
+var GenericWorker = require("./GenericWorker");
+var utils = require("../utils");
+
+/**
+ * A worker which convert chunks to a specified type.
+ * @constructor
+ * @param {String} destType the destination type.
+ */
function ConvertWorker(destType) {
    GenericWorker.call(this, "ConvertWorker to " + destType);
    this.destType = destType;
}
utils.inherits(ConvertWorker, GenericWorker);

/**
 * @see GenericWorker.processChunk
 */
ConvertWorker.prototype.processChunk = function (chunk) {
    // Convert only the payload; the metadata travels through untouched.
    var converted = utils.transformTo(this.destType, chunk.data);
    this.push({
        data : converted,
        meta : chunk.meta
    });
};
module.exports = ConvertWorker;
+
+},{"../utils":32,"./GenericWorker":28}],25:[function(require,module,exports){
+"use strict";
+
+var GenericWorker = require("./GenericWorker");
+var crc32 = require("../crc32");
+var utils = require("../utils");
+
+/**
+ * A worker which calculate the crc32 of the data flowing through.
+ * @constructor
+ */
function Crc32Probe() {
    GenericWorker.call(this, "Crc32Probe");
    // Expose a running crc32 on the chain's streamInfo, starting at 0.
    this.withStreamInfo("crc32", 0);
}
utils.inherits(Crc32Probe, GenericWorker);

/**
 * @see GenericWorker.processChunk
 */
Crc32Probe.prototype.processChunk = function (chunk) {
    var previous = this.streamInfo.crc32 || 0;
    this.streamInfo.crc32 = crc32(chunk.data, previous);
    this.push(chunk);
};
module.exports = Crc32Probe;
+
+},{"../crc32":4,"../utils":32,"./GenericWorker":28}],26:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var GenericWorker = require("./GenericWorker");
+
+/**
+ * A worker which calculate the total length of the data flowing through.
+ * @constructor
+ * @param {String} propName the name used to expose the length
+ */
function DataLengthProbe(propName) {
    GenericWorker.call(this, "DataLengthProbe for " + propName);
    this.propName = propName;
    // Expose the running total on the chain's streamInfo, starting at 0.
    this.withStreamInfo(propName, 0);
}
utils.inherits(DataLengthProbe, GenericWorker);

/**
 * @see GenericWorker.processChunk
 */
DataLengthProbe.prototype.processChunk = function (chunk) {
    if (chunk) {
        var soFar = this.streamInfo[this.propName] || 0;
        this.streamInfo[this.propName] = soFar + chunk.data.length;
    }
    // Forward the chunk downstream untouched.
    GenericWorker.prototype.processChunk.call(this, chunk);
};
module.exports = DataLengthProbe;
+
+
+},{"../utils":32,"./GenericWorker":28}],27:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var GenericWorker = require("./GenericWorker");
+
+// the size of the generated chunks
+// TODO expose this as a public variable
var DEFAULT_BLOCK_SIZE = 16 * 1024;

/**
 * A worker that reads a content and emits chunks.
 * @constructor
 * @param {Promise} dataP the promise of the data to split
 */
function DataWorker(dataP) {
    GenericWorker.call(this, "DataWorker");
    var self = this;
    // true once the promise resolved and data/max/type are usable
    this.dataIsReady = false;
    // read cursor inside `data`
    this.index = 0;
    // total length of `data` (0 when data is null/empty)
    this.max = 0;
    this.data = null;
    // type of `data` as returned by utils.getTypeOf
    this.type = "";

    // guards against scheduling _tickAndRepeat twice at the same time
    this._tickScheduled = false;

    dataP.then(function (data) {
        self.dataIsReady = true;
        self.data = data;
        self.max = data && data.length || 0;
        self.type = utils.getTypeOf(data);
        // start emitting right away unless a downstream worker paused us;
        // when paused, resume() will kick off the ticks instead
        if(!self.isPaused) {
            self._tickAndRepeat();
        }
    }, function (e) {
        self.error(e);
    });
}

utils.inherits(DataWorker, GenericWorker);

/**
 * @see GenericWorker.cleanUp
 */
DataWorker.prototype.cleanUp = function () {
    GenericWorker.prototype.cleanUp.call(this);
    // drop the (possibly large) buffered data once the stream is done
    this.data = null;
};

/**
 * @see GenericWorker.resume
 */
DataWorker.prototype.resume = function () {
    if(!GenericWorker.prototype.resume.call(this)) {
        return false;
    }

    // If the data promise has not resolved yet, the `then` callback in the
    // constructor will start the ticks instead of this call.
    if (!this._tickScheduled && this.dataIsReady) {
        this._tickScheduled = true;
        utils.delay(this._tickAndRepeat, [], this);
    }
    return true;
};

/**
 * Trigger a tick and schedule another call to this function.
 */
DataWorker.prototype._tickAndRepeat = function() {
    this._tickScheduled = false;
    if(this.isPaused || this.isFinished) {
        return;
    }
    this._tick();
    if(!this.isFinished) {
        // keep emitting asynchronously until the end of the data
        utils.delay(this._tickAndRepeat, [], this);
        this._tickScheduled = true;
    }
};

/**
 * Read and push a chunk.
 */
DataWorker.prototype._tick = function() {

    if(this.isPaused || this.isFinished) {
        return false;
    }

    var size = DEFAULT_BLOCK_SIZE;
    var data = null, nextIndex = Math.min(this.max, this.index + size);
    if (this.index >= this.max) {
        // EOF
        return this.end();
    } else {
        // each supported data type has its own sub-block extraction method
        switch(this.type) {
            case "string":
                data = this.data.substring(this.index, nextIndex);
                break;
            case "uint8array":
                data = this.data.subarray(this.index, nextIndex);
                break;
            case "array":
            case "nodebuffer":
                data = this.data.slice(this.index, nextIndex);
                break;
        }
        this.index = nextIndex;
        return this.push({
            data : data,
            meta : {
                // progress indicator for listeners, 0 when data is empty
                percent : this.max ? this.index / this.max * 100 : 0
            }
        });
    }
};

module.exports = DataWorker;
+
+},{"../utils":32,"./GenericWorker":28}],28:[function(require,module,exports){
+"use strict";
+
+/**
+ * A worker that does nothing but passing chunks to the next one. This is like
+ * a nodejs stream but with some differences. On the good side :
+ * - it works on IE 6-9 without any issue / polyfill
+ * - it weights less than the full dependencies bundled with browserify
+ * - it forwards errors (no need to declare an error handler EVERYWHERE)
+ *
+ * A chunk is an object with 2 attributes : `meta` and `data`. The former is an
+ * object containing anything (`percent` for example), see each worker for more
+ * details. The latter is the real data (String, Uint8Array, etc).
+ *
+ * @constructor
+ * @param {String} name the name of the stream (mainly used for debugging purposes)
+ */
function GenericWorker(name) {
    // the name of the worker
    this.name = name || "default";
    // an object containing metadata about the workers chain
    this.streamInfo = {};
    // an error which happened when the worker was paused
    this.generatedError = null;
    // an object containing metadata to be merged by this worker into the general metadata
    this.extraStreamInfo = {};
    // true if the stream is paused (and should not do anything), false otherwise
    this.isPaused = true;
    // true if the stream is finished (and should not do anything), false otherwise
    this.isFinished = false;
    // true if the stream is locked to prevent further structure updates (pipe), false otherwise
    this.isLocked = false;
    // the event listeners
    this._listeners = {
        "data":[],
        "end":[],
        "error":[]
    };
    // the previous worker, if any
    this.previous = null;
}

GenericWorker.prototype = {
    /**
     * Push a chunk to the next workers.
     * @param {Object} chunk the chunk to push
     */
    push : function (chunk) {
        this.emit("data", chunk);
    },
    /**
     * End the stream.
     * @return {Boolean} true if this call ended the worker, false otherwise.
     */
    end : function () {
        if (this.isFinished) {
            return false;
        }

        this.flush();
        try {
            this.emit("end");
            this.cleanUp();
            this.isFinished = true;
        } catch (e) {
            // a listener threw during "end": convert it into an error event
            this.emit("error", e);
        }
        return true;
    },
    /**
     * End the stream with an error.
     * @param {Error} e the error which caused the premature end.
     * @return {Boolean} true if this call ended the worker with an error, false otherwise.
     */
    error : function (e) {
        if (this.isFinished) {
            return false;
        }

        if(this.isPaused) {
            // while paused, remember the error; resume() will re-deliver it
            this.generatedError = e;
        } else {
            this.isFinished = true;

            this.emit("error", e);

            // if the workers chain exploded in the middle of the chain,
            // the error event will go downward but we also need to notify
            // workers upward that there has been an error.
            if(this.previous) {
                this.previous.error(e);
            }

            this.cleanUp();
        }
        return true;
    },
    /**
     * Add a callback on an event.
     * @param {String} name the name of the event (data, end, error)
     * @param {Function} listener the function to call when the event is triggered
     * @return {GenericWorker} the current object for chainability
     */
    on : function (name, listener) {
        this._listeners[name].push(listener);
        return this;
    },
    /**
     * Clean any references when a worker is ending.
     */
    cleanUp : function () {
        this.streamInfo = this.generatedError = this.extraStreamInfo = null;
        // an empty array: subsequent emit() lookups by event name return
        // undefined, so emitting after cleanUp is a harmless no-op
        this._listeners = [];
    },
    /**
     * Trigger an event. This will call registered callback with the provided arg.
     * @param {String} name the name of the event (data, end, error)
     * @param {Object} arg the argument to call the callback with.
     */
    emit : function (name, arg) {
        if (this._listeners[name]) {
            for(var i = 0; i < this._listeners[name].length; i++) {
                this._listeners[name][i].call(this, arg);
            }
        }
    },
    /**
     * Chain a worker with an other.
     * @param {Worker} next the worker receiving events from the current one.
     * @return {worker} the next worker for chainability
     */
    pipe : function (next) {
        return next.registerPrevious(this);
    },
    /**
     * Same as `pipe` in the other direction.
     * Using an API with `pipe(next)` is very easy.
     * Implementing the API with the point of view of the next one registering
     * a source is easier, see the ZipFileWorker.
     * @param {Worker} previous the previous worker, sending events to this one
     * @return {Worker} the current worker for chainability
     */
    registerPrevious : function (previous) {
        if (this.isLocked) {
            throw new Error("The stream '" + this + "' has already been used.");
        }

        // sharing the streamInfo...
        this.streamInfo = previous.streamInfo;
        // ... and adding our own bits
        this.mergeStreamInfo();
        this.previous = previous;
        var self = this;
        previous.on("data", function (chunk) {
            self.processChunk(chunk);
        });
        previous.on("end", function () {
            self.end();
        });
        previous.on("error", function (e) {
            self.error(e);
        });
        return this;
    },
    /**
     * Pause the stream so it doesn't send events anymore.
     * The pause propagates upward through the whole chain.
     * @return {Boolean} true if this call paused the worker, false otherwise.
     */
    pause : function () {
        if(this.isPaused || this.isFinished) {
            return false;
        }
        this.isPaused = true;

        if(this.previous) {
            this.previous.pause();
        }
        return true;
    },
    /**
     * Resume a paused stream. The resume propagates upward through the chain.
     * @return {Boolean} true if this call resumed the worker, false otherwise.
     */
    resume : function () {
        if(!this.isPaused || this.isFinished) {
            return false;
        }
        this.isPaused = false;

        // if true, the worker tried to resume but failed
        var withError = false;
        if(this.generatedError) {
            // deliver the error that was buffered while paused
            this.error(this.generatedError);
            withError = true;
        }
        if(this.previous) {
            this.previous.resume();
        }

        return !withError;
    },
    /**
     * Flush any remaining bytes as the stream is ending.
     */
    flush : function () {},
    /**
     * Process a chunk. This is usually the method overridden.
     * The default implementation forwards the chunk unchanged.
     * @param {Object} chunk the chunk to process.
     */
    processChunk : function(chunk) {
        this.push(chunk);
    },
    /**
     * Add a key/value to be added in the workers chain streamInfo once activated.
     * @param {String} key the key to use
     * @param {Object} value the associated value
     * @return {Worker} the current worker for chainability
     */
    withStreamInfo : function (key, value) {
        this.extraStreamInfo[key] = value;
        this.mergeStreamInfo();
        return this;
    },
    /**
     * Merge this worker's streamInfo into the chain's streamInfo.
     */
    mergeStreamInfo : function () {
        for(var key in this.extraStreamInfo) {
            if (!Object.prototype.hasOwnProperty.call(this.extraStreamInfo, key)) {
                continue;
            }
            this.streamInfo[key] = this.extraStreamInfo[key];
        }
    },

    /**
     * Lock the stream to prevent further updates on the workers chain.
     * After calling this method, all calls to pipe will fail.
     * The lock propagates upward through the whole chain.
     */
    lock: function () {
        if (this.isLocked) {
            throw new Error("The stream '" + this + "' has already been used.");
        }
        this.isLocked = true;
        if (this.previous) {
            this.previous.lock();
        }
    },

    /**
     *
     * Pretty print the workers chain.
     */
    toString : function () {
        var me = "Worker " + this.name;
        if (this.previous) {
            return this.previous + " -> " + me;
        } else {
            return me;
        }
    }
};

module.exports = GenericWorker;
+
+},{}],29:[function(require,module,exports){
+"use strict";
+
+var utils = require("../utils");
+var ConvertWorker = require("./ConvertWorker");
+var GenericWorker = require("./GenericWorker");
+var base64 = require("../base64");
+var support = require("../support");
+var external = require("../external");
+
var NodejsStreamOutputAdapter = null;
if (support.nodestream) {
    try {
        // Best effort: the adapter module may be absent (e.g. stripped by a
        // browser bundler); in that case it simply stays null.
        NodejsStreamOutputAdapter = require("../nodejs/NodejsStreamOutputAdapter");
    } catch(e) {
        // ignore
    }
}
+
+/**
+ * Apply the final transformation of the data. If the user wants a Blob for
+ * example, it's easier to work with an U8intArray and finally do the
+ * ArrayBuffer/Blob conversion.
+ * @param {String} type the name of the final type
+ * @param {String|Uint8Array|Buffer} content the content to transform
+ * @param {String} mimeType the mime type of the content, if applicable.
+ * @return {String|Uint8Array|ArrayBuffer|Buffer|Blob} the content in the right format.
+ */
function transformZipOutput(type, content, mimeType) {
    // Blobs are built from an ArrayBuffer; base64 gets its own encoder;
    // every other type goes through the generic converter.
    if (type === "blob") {
        return utils.newBlob(utils.transformTo("arraybuffer", content), mimeType);
    }
    if (type === "base64") {
        return base64.encode(content);
    }
    return utils.transformTo(type, content);
}
+
+/**
+ * Concatenate an array of data of the given type.
+ * @param {String} type the type of the data in the given array.
+ * @param {Array} dataArray the array containing the data chunks to concatenate
+ * @return {String|Uint8Array|Buffer} the concatenated data
+ * @throws Error if the asked type is unsupported
+ */
function concat (type, dataArray) {
    var i, offset, total, merged;
    switch(type) {
        case "string":
            return dataArray.join("");
        case "array":
            return Array.prototype.concat.apply([], dataArray);
        case "uint8array":
            // typed arrays cannot grow: compute the final size first,
            // then copy each chunk into place.
            total = 0;
            for (i = 0; i < dataArray.length; i++) {
                total += dataArray[i].length;
            }
            merged = new Uint8Array(total);
            offset = 0;
            for (i = 0; i < dataArray.length; i++) {
                merged.set(dataArray[i], offset);
                offset += dataArray[i].length;
            }
            return merged;
        case "nodebuffer":
            return Buffer.concat(dataArray);
        default:
            throw new Error("concat : unsupported type '"  + type + "'");
    }
}
+
+/**
+ * Listen a StreamHelper, accumulate its content and concatenate it into a
+ * complete block.
+ * @param {StreamHelper} helper the helper to use.
+ * @param {Function} updateCallback a callback called on each update. Called
+ * with one arg :
+ * - the metadata linked to the update received.
+ * @return Promise the promise for the accumulation.
+ */
function accumulate(helper, updateCallback) {
    return new external.Promise(function (resolve, reject){
        // collected chunks, concatenated once the stream ends
        var chunks = [];
        var chunkType = helper._internalType,
            resultType = helper._outputType,
            mimeType = helper._mimeType;
        helper
            .on("data", function (data, meta) {
                chunks.push(data);
                if(updateCallback) {
                    updateCallback(meta);
                }
            })
            .on("error", function(err) {
                // free the buffered chunks before failing
                chunks = [];
                reject(err);
            })
            .on("end", function (){
                try {
                    var assembled = transformZipOutput(resultType, concat(chunkType, chunks), mimeType);
                    resolve(assembled);
                } catch (e) {
                    // concat/transform may throw (e.g. unsupported type)
                    reject(e);
                }
                chunks = [];
            })
            .resume();
    });
}
+
/**
 * An helper to easily use workers outside of JSZip.
 * @constructor
 * @param {Worker} worker the worker to wrap
 * @param {String} outputType the type of data expected by the use
 * @param {String} mimeType the mime type of the content, if applicable.
 */
function StreamHelper(worker, outputType, mimeType) {
    // pick the representation used while streaming: blob/arraybuffer results
    // are assembled from uint8array chunks, base64 from string chunks, and
    // every other output type streams as itself.
    var internalType;
    if (outputType === "blob" || outputType === "arraybuffer") {
        internalType = "uint8array";
    } else if (outputType === "base64") {
        internalType = "string";
    } else {
        internalType = outputType;
    }

    try {
        // the type used internally
        this._internalType = internalType;
        // the type used to output results
        this._outputType = outputType;
        // the mime type
        this._mimeType = mimeType;
        utils.checkSupport(internalType);
        this._worker = worker.pipe(new ConvertWorker(internalType));
        // the last workers can be rewired without issues but we need to
        // prevent any updates on previous workers.
        worker.lock();
    } catch (e) {
        // surface the failure through the usual error event instead of throwing
        this._worker = new GenericWorker("error");
        this._worker.error(e);
    }
}
+
StreamHelper.prototype = {
    /**
     * Listen a StreamHelper, accumulate its content and concatenate it into a
     * complete block.
     * @param {Function} updateCb the update callback.
     * @return Promise the promise for the accumulation.
     */
    accumulate : function (updateCb) {
        return accumulate(this, updateCb);
    },
    /**
     * Add a listener on an event triggered on a stream.
     * @param {String} evt the name of the event
     * @param {Function} fn the listener
     * @return {StreamHelper} the current helper.
     */
    on : function (evt, fn) {
        var helper = this;

        if (evt === "data") {
            // unwrap the {data, meta} envelope for data listeners
            this._worker.on(evt, function (chunk) {
                fn.call(helper, chunk.data, chunk.meta);
            });
        } else {
            // every other event is delivered asynchronously (see utils.delay)
            this._worker.on(evt, function () {
                utils.delay(fn, arguments, helper);
            });
        }
        return this;
    },
    /**
     * Resume the flow of chunks.
     * @return {StreamHelper} the current helper.
     */
    resume : function () {
        // defer the actual resume to a later tick (see utils.delay)
        utils.delay(this._worker.resume, [], this._worker);
        return this;
    },
    /**
     * Pause the flow of chunks.
     * @return {StreamHelper} the current helper.
     */
    pause : function () {
        this._worker.pause();
        return this;
    },
    /**
     * Return a nodejs stream for this helper.
     * @param {Function} updateCb the update callback.
     * @return {NodejsStreamOutputAdapter} the nodejs stream.
     */
    toNodejsStream : function (updateCb) {
        utils.checkSupport("nodestream");
        if (this._outputType !== "nodebuffer") {
            // an object stream containing blob/arraybuffer/uint8array/string
            // is strange and of unclear usefulness. If you find this comment
            // and have a good usecase, please open a bug report !
            throw new Error(this._outputType + " is not supported by this method");
        }

        return new NodejsStreamOutputAdapter(this, {
            objectMode : this._outputType !== "nodebuffer"
        }, updateCb);
    }
};
+
+
+module.exports = StreamHelper;
+
+},{"../base64":1,"../external":6,"../nodejs/NodejsStreamOutputAdapter":13,"../support":30,"../utils":32,"./ConvertWorker":24,"./GenericWorker":28}],30:[function(require,module,exports){
+"use strict";
+
// Feature-detection flags, computed once at load time. Each flag tells the
// rest of JSZip whether the corresponding data type is usable here.
exports.base64 = true;
exports.array = true;
exports.string = true;
exports.arraybuffer = typeof ArrayBuffer !== "undefined" && typeof Uint8Array !== "undefined";
exports.nodebuffer = typeof Buffer !== "undefined";
// contains true if JSZip can read/generate Uint8Array, false otherwise.
exports.uint8array = typeof Uint8Array !== "undefined";

if (typeof ArrayBuffer === "undefined") {
    exports.blob = false;
}
else {
    // Probe Blob support with an empty buffer: some engines expose the
    // constructor but throw (or misbehave) when it is actually used.
    var buffer = new ArrayBuffer(0);
    try {
        exports.blob = new Blob([buffer], {
            type: "application/zip"
        }).size === 0;
    }
    catch (e) {
        // Fall back to the deprecated, vendor-prefixed BlobBuilder API
        // (older WebKit / IE / Firefox builds).
        try {
            var Builder = self.BlobBuilder || self.WebKitBlobBuilder || self.MozBlobBuilder || self.MSBlobBuilder;
            var builder = new Builder();
            builder.append(buffer);
            exports.blob = builder.getBlob("application/zip").size === 0;
        }
        catch (e) {
            exports.blob = false;
        }
    }
}

try {
    // nodejs streams are only usable if the readable-stream shim resolves
    exports.nodestream = !!require("readable-stream").Readable;
} catch(e) {
    exports.nodestream = false;
}
+
+},{"readable-stream":16}],31:[function(require,module,exports){
+"use strict";
+
+var utils = require("./utils");
+var support = require("./support");
+var nodejsUtils = require("./nodejsUtils");
+var GenericWorker = require("./stream/GenericWorker");
+
+/**
+ * The following functions come from pako, from pako/lib/utils/strings
+ * released under the MIT license, see pako https://github.com/nodeca/pako/
+ */
+
// Table with utf8 lengths (calculated by first byte of sequence)
// Note, that 5 & 6-byte values and some 4-byte values can not be represented in JS,
// because max possible codepoint is 0x10ffff
var _utf8len = new Array(256);
for (var i = 0; i < 256; i++) {
    _utf8len[i] =
        i >= 252 ? 6 :
        i >= 248 ? 5 :
        i >= 240 ? 4 :
        i >= 224 ? 3 :
        i >= 192 ? 2 : 1;
}
// 0xFE marks an invalid sequence start. The duplicated index is kept exactly
// as in upstream pako (0xFF was presumably intended for one of the two
// assignments — TODO confirm against upstream before changing behavior).
_utf8len[254] = _utf8len[254] = 1; // Invalid sequence start
+
// convert string to array (typed, when possible)
var string2buf = function (str) {
    var buf, code, low, pos, out, str_len = str.length, buf_len = 0;

    // first pass: count the number of utf-8 bytes needed
    for (pos = 0; pos < str_len; pos++) {
        code = str.charCodeAt(pos);
        // combine a high surrogate with the following low surrogate, if any
        if ((code & 0xfc00) === 0xd800 && (pos + 1 < str_len)) {
            low = str.charCodeAt(pos + 1);
            if ((low & 0xfc00) === 0xdc00) {
                code = 0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00);
                pos++;
            }
        }
        buf_len += code < 0x80 ? 1 : code < 0x800 ? 2 : code < 0x10000 ? 3 : 4;
    }

    // allocate the output buffer (typed when the platform supports it)
    buf = support.uint8array ? new Uint8Array(buf_len) : new Array(buf_len);

    // second pass: encode each codepoint
    for (out = 0, pos = 0; out < buf_len; pos++) {
        code = str.charCodeAt(pos);
        if ((code & 0xfc00) === 0xd800 && (pos + 1 < str_len)) {
            low = str.charCodeAt(pos + 1);
            if ((low & 0xfc00) === 0xdc00) {
                code = 0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00);
                pos++;
            }
        }
        if (code < 0x80) {
            /* one byte */
            buf[out++] = code;
        } else if (code < 0x800) {
            /* two bytes */
            buf[out++] = 0xC0 | (code >>> 6);
            buf[out++] = 0x80 | (code & 0x3f);
        } else if (code < 0x10000) {
            /* three bytes */
            buf[out++] = 0xE0 | (code >>> 12);
            buf[out++] = 0x80 | (code >>> 6 & 0x3f);
            buf[out++] = 0x80 | (code & 0x3f);
        } else {
            /* four bytes */
            buf[out++] = 0xf0 | (code >>> 18);
            buf[out++] = 0x80 | (code >>> 12 & 0x3f);
            buf[out++] = 0x80 | (code >>> 6 & 0x3f);
            buf[out++] = 0x80 | (code & 0x3f);
        }
    }

    return buf;
};
+
// Calculate max possible position in utf8 buffer,
// that will not break sequence. If that's not possible
// - (very small limits) return max size as is.
//
// buf[] - utf8 bytes array
// max - length limit (mandatory);
var utf8border = function (buf, max) {
    max = max || buf.length;
    if (max > buf.length) {
        max = buf.length;
    }

    // walk backwards over continuation bytes (10xxxxxx) until the byte that
    // starts the last sequence is found
    var pos = max - 1;
    while (pos >= 0 && (buf[pos] & 0xC0) === 0x80) {
        pos--;
    }

    // degenerate case: nothing but continuation bytes — return max, because
    // we should return something anyway.
    if (pos < 0) {
        return max;
    }

    // the whole (truncated) buffer is one sequence start: the buffer is too
    // small, return max too.
    if (pos === 0) {
        return max;
    }

    // cut before the last sequence only if it is incomplete
    return (pos + _utf8len[buf[pos]] > max) ? pos : max;
};
+
// convert array to string
var buf2string = function (buf) {
    var i, out, c, c_len;
    var len = buf.length;

    // Reserve max possible length (2 words per char)
    // NB: by unknown reasons, Array is significantly faster for
    // String.fromCharCode.apply than Uint16Array.
    var utf16buf = new Array(len*2);

    // NOTE(review): the loop header and the ascii fast path below were lost
    // when this file was copied (the "i < len" comparison and the following
    // lines were swallowed, leaving "for (out=0, i=0; i 4) {...}");
    // restored from pako's lib/utils/strings.js.
    for (out = 0, i = 0; i < len;) {
        c = buf[i++];

        // quick process ascii
        if (c < 0x80) { utf16buf[out++] = c; continue; }

        c_len = _utf8len[c];
        // skip 5 & 6 byte codes (not representable in JS strings)
        if (c_len > 4) { utf16buf[out++] = 0xfffd; i += c_len - 1; continue; }

        // apply mask on first byte
        c &= c_len === 2 ? 0x1f : c_len === 3 ? 0x0f : 0x07;
        // join the rest
        while (c_len > 1 && i < len) {
            c = (c << 6) | (buf[i++] & 0x3f);
            c_len--;
        }

        // terminated by end of string? emit a replacement char
        if (c_len > 1) { utf16buf[out++] = 0xfffd; continue; }

        if (c < 0x10000) {
            utf16buf[out++] = c;
        } else {
            // astral codepoint: encode as a surrogate pair
            c -= 0x10000;
            utf16buf[out++] = 0xd800 | ((c >> 10) & 0x3ff);
            utf16buf[out++] = 0xdc00 | (c & 0x3ff);
        }
    }

    // shrinkBuf(utf16buf, out)
    if (utf16buf.length !== out) {
        if(utf16buf.subarray) {
            utf16buf = utf16buf.subarray(0, out);
        } else {
            utf16buf.length = out;
        }
    }

    // return String.fromCharCode.apply(null, utf16buf);
    return utils.applyFromCharCode(utf16buf);
};
+
+
+// That's all for the pako functions.
+
+
+/**
+ * Transform a javascript string into an array (typed if possible) of bytes,
+ * UTF-8 encoded.
+ * @param {String} str the string to encode
+ * @return {Array|Uint8Array|Buffer} the UTF-8 encoded string.
+ */
+exports.utf8encode = function utf8encode(str) {
+ if (support.nodebuffer) {
+ return nodejsUtils.newBufferFrom(str, "utf-8");
+ }
+
+ return string2buf(str);
+};
+
+
+/**
+ * Transform a bytes array (or a representation) representing an UTF-8 encoded
+ * string into a javascript string.
+ * @param {Array|Uint8Array|Buffer} buf the data de decode
+ * @return {String} the decoded string.
+ */
+exports.utf8decode = function utf8decode(buf) {
+ if (support.nodebuffer) {
+ return utils.transformTo("nodebuffer", buf).toString("utf-8");
+ }
+
+ buf = utils.transformTo(support.uint8array ? "uint8array" : "array", buf);
+
+ return buf2string(buf);
+};
+
/**
 * A worker to decode utf8 encoded binary chunks into string chunks.
 * @constructor
 */
function Utf8DecodeWorker() {
    GenericWorker.call(this, "utf-8 decode");
    // the last bytes if a chunk didn't end with a complete codepoint.
    // Filled by processChunk, consumed by the next processChunk or by flush.
    this.leftOver = null;
}
utils.inherits(Utf8DecodeWorker, GenericWorker);
+
/**
 * @see GenericWorker.processChunk
 */
Utf8DecodeWorker.prototype.processChunk = function (chunk) {

    var data = utils.transformTo(support.uint8array ? "uint8array" : "array", chunk.data);

    // 1st step, prepend what's left of the previous chunk
    if (this.leftOver && this.leftOver.length) {
        if (support.uint8array) {
            var incoming = data;
            data = new Uint8Array(incoming.length + this.leftOver.length);
            data.set(this.leftOver, 0);
            data.set(incoming, this.leftOver.length);
        } else {
            data = this.leftOver.concat(data);
        }
        this.leftOver = null;
    }

    // 2nd step, find the last complete codepoint boundary and keep the
    // trailing incomplete bytes for the next chunk
    var nextBoundary = utf8border(data);
    var usableData = data;
    if (nextBoundary !== data.length) {
        if (support.uint8array) {
            usableData = data.subarray(0, nextBoundary);
            this.leftOver = data.subarray(nextBoundary, data.length);
        } else {
            usableData = data.slice(0, nextBoundary);
            this.leftOver = data.slice(nextBoundary, data.length);
        }
    }

    this.push({
        data : exports.utf8decode(usableData),
        meta : chunk.meta
    });
};
+
/**
 * @see GenericWorker.flush
 */
Utf8DecodeWorker.prototype.flush = function () {
    // emit whatever incomplete byte sequence is still pending
    var pending = this.leftOver;
    if (pending && pending.length) {
        this.push({
            data : exports.utf8decode(pending),
            meta : {}
        });
        this.leftOver = null;
    }
};
exports.Utf8DecodeWorker = Utf8DecodeWorker;
+
/**
 * A worker to encode string chunks into utf8 encoded binary chunks.
 * @constructor
 */
function Utf8EncodeWorker() {
    GenericWorker.call(this, "utf-8 encode");
}
utils.inherits(Utf8EncodeWorker, GenericWorker);
+
/**
 * @see GenericWorker.processChunk
 */
Utf8EncodeWorker.prototype.processChunk = function (chunk) {
    // encode the string payload, keep the metadata untouched
    this.push({
        data : exports.utf8encode(chunk.data),
        meta : chunk.meta
    });
};
exports.Utf8EncodeWorker = Utf8EncodeWorker;
+
+},{"./nodejsUtils":14,"./stream/GenericWorker":28,"./support":30,"./utils":32}],32:[function(require,module,exports){
+"use strict";
+
+var support = require("./support");
+var base64 = require("./base64");
+var nodejsUtils = require("./nodejsUtils");
+var external = require("./external");
+require("setimmediate");
+
+
/**
 * Convert a string that pass as a "binary string": it should represent a byte
 * array but may have > 255 char codes. Be sure to take only the first byte
 * and returns the byte array.
 * @param {String} str the string to transform.
 * @return {Array|Uint8Array} the string in a binary format.
 */
function string2binary(str) {
    // use a typed array when the platform supports it
    var storage = support.uint8array ? new Uint8Array(str.length) : new Array(str.length);
    return stringToArrayLike(str, storage);
}
+
+/**
+ * Create a new blob with the given content and the given type.
+ * @param {String|ArrayBuffer} part the content to put in the blob. DO NOT use
+ * an Uint8Array because the stock browser of android 4 won't accept it (it
+ * will be silently converted to a string, "[object Uint8Array]").
+ *
+ * Use only ONE part to build the blob to avoid a memory leak in IE11 / Edge:
+ * when a large amount of Array is used to create the Blob, the amount of
+ * memory consumed is nearly 100 times the original data amount.
+ *
+ * @param {String} type the mime type of the blob.
+ * @return {Blob} the created blob.
+ */
+exports.newBlob = function(part, type) {
+ exports.checkSupport("blob");
+
+ try {
+ // Blob constructor
+ return new Blob([part], {
+ type: type
+ });
+ }
+ catch (e) {
+
+ try {
+ // deprecated, browser only, old way
+ var Builder = self.BlobBuilder || self.WebKitBlobBuilder || self.MozBlobBuilder || self.MSBlobBuilder;
+ var builder = new Builder();
+ builder.append(part);
+ return builder.getBlob(type);
+ }
+ catch (e) {
+
+ // well, fuck ?!
+ throw new Error("Bug : can't construct the Blob.");
+ }
+ }
+
+
+};
/**
 * The identity function.
 * @param {Object} input the input.
 * @return {Object} the same input.
 */
function identity(input) {
    // used as the no-op entry in the conversion matrix below
    return input;
}
+
/**
 * Fill in an array with a string.
 * @param {String} str the string to use.
 * @param {Array|ArrayBuffer|Uint8Array|Buffer} array the array to fill in (will be mutated).
 * @return {Array|ArrayBuffer|Uint8Array|Buffer} the updated array.
 */
function stringToArrayLike(str, array) {
    var length = str.length;
    for (var pos = 0; pos < length; ++pos) {
        // keep only the lowest byte of each char code ("binary string" rule)
        array[pos] = str.charCodeAt(pos) & 0xFF;
    }
    return array;
}
+
/**
 * An helper for the function arrayLikeToString.
 * This contains static information and functions that
 * can be optimized by the browser JIT compiler.
 */
var arrayToStringHelper = {
    /**
     * Transform an array of int into a string, chunk by chunk.
     * See the performances notes on arrayLikeToString.
     * @param {Array|ArrayBuffer|Uint8Array|Buffer} array the array to transform.
     * @param {String} type the type of the array.
     * @param {Integer} chunk the chunk size.
     * @return {String} the resulting string.
     * @throws Error if the chunk is too big for the stack.
     */
    stringifyByChunk: function(array, type, chunk) {
        var len = array.length;
        // shortcut: everything fits in a single apply
        if (len <= chunk) {
            return String.fromCharCode.apply(null, array);
        }
        // plain arrays and node Buffers expose slice, typed arrays subarray
        var useSlice = (type === "array" || type === "nodebuffer");
        var pieces = [];
        for (var k = 0; k < len; k += chunk) {
            var end = Math.min(k + chunk, len);
            pieces.push(String.fromCharCode.apply(null, useSlice ? array.slice(k, end) : array.subarray(k, end)));
        }
        return pieces.join("");
    },
    /**
     * Call String.fromCharCode on every item in the array.
     * This is the naive implementation, which generate A LOT of intermediate string.
     * This should be used when everything else fail.
     * @param {Array|ArrayBuffer|Uint8Array|Buffer} array the array to transform.
     * @return {String} the result.
     */
    stringifyByChar: function(array){
        var str = "";
        for (var i = 0; i < array.length; i++) {
            str += String.fromCharCode(array[i]);
        }
        return str;
    },
    applyCanBeUsed : {
        /**
         * true if the browser accepts to use String.fromCharCode on Uint8Array
         */
        uint8array : (function () {
            try {
                return support.uint8array && String.fromCharCode.apply(null, new Uint8Array(1)).length === 1;
            } catch (e) {
                return false;
            }
        })(),
        /**
         * true if the browser accepts to use String.fromCharCode on nodejs Buffer.
         */
        nodebuffer : (function () {
            try {
                return support.nodebuffer && String.fromCharCode.apply(null, nodejsUtils.allocBuffer(1)).length === 1;
            } catch (e) {
                return false;
            }
        })()
    }
};
+
/**
 * Transform an array-like object to a string.
 * @param {Array|ArrayBuffer|Uint8Array|Buffer} array the array to transform.
 * @return {String} the result.
 */
function arrayLikeToString(array) {
    // Performances notes :
    // --------------------
    // String.fromCharCode.apply(null, array) is the fastest, see
    // http://jsperf.com/converting-a-uint8array-to-a-string/2
    // but the stack is limited (and we can get huge arrays !).
    //
    // result += String.fromCharCode(array[i]); generates too many strings !
    //
    // This code is inspired by http://jsperf.com/arraybuffer-to-string-apply-performance/2
    // TODO : we now have workers that split the work. Do we still need that ?
    var type = exports.getTypeOf(array);
    var canUseApply = true;
    if (type === "uint8array") {
        canUseApply = arrayToStringHelper.applyCanBeUsed.uint8array;
    } else if (type === "nodebuffer") {
        canUseApply = arrayToStringHelper.applyCanBeUsed.nodebuffer;
    }

    if (canUseApply) {
        // halve the chunk size until apply stops blowing the stack
        var chunk = 65536;
        while (chunk > 1) {
            try {
                return arrayToStringHelper.stringifyByChunk(array, type, chunk);
            } catch (e) {
                chunk = Math.floor(chunk / 2);
            }
        }
    }

    // no apply or chunk error : slow and painful algorithm
    // default browser on android 4.*
    return arrayToStringHelper.stringifyByChar(array);
}

exports.applyFromCharCode = arrayLikeToString;
+
+
/**
 * Copy the data from an array-like to an other array-like.
 * @param {Array|ArrayBuffer|Uint8Array|Buffer} arrayFrom the origin array.
 * @param {Array|ArrayBuffer|Uint8Array|Buffer} arrayTo the destination array which will be mutated.
 * @return {Array|ArrayBuffer|Uint8Array|Buffer} the updated destination array.
 */
function arrayLikeToArrayLike(arrayFrom, arrayTo) {
    var length = arrayFrom.length;
    for (var pos = 0; pos < length; pos++) {
        arrayTo[pos] = arrayFrom[pos];
    }
    return arrayTo;
}
+
// a matrix containing functions to transform everything into everything:
// transform[inputType][outputType] is a one-argument converter. Conversions
// to "string" all go through arrayLikeToString; conversions between
// array-likes copy element by element or use the native constructors.
var transform = {};

// string to ?
transform["string"] = {
    "string": identity,
    "array": function(input) {
        return stringToArrayLike(input, new Array(input.length));
    },
    // reuse the string -> uint8array conversion and take its backing buffer
    "arraybuffer": function(input) {
        return transform["string"]["uint8array"](input).buffer;
    },
    "uint8array": function(input) {
        return stringToArrayLike(input, new Uint8Array(input.length));
    },
    "nodebuffer": function(input) {
        return stringToArrayLike(input, nodejsUtils.allocBuffer(input.length));
    }
};

// array to ?
transform["array"] = {
    "string": arrayLikeToString,
    "array": identity,
    "arraybuffer": function(input) {
        return (new Uint8Array(input)).buffer;
    },
    "uint8array": function(input) {
        return new Uint8Array(input);
    },
    "nodebuffer": function(input) {
        return nodejsUtils.newBufferFrom(input);
    }
};

// arraybuffer to ?
transform["arraybuffer"] = {
    "string": function(input) {
        // wrap in a view first: arrayLikeToString needs indexed access
        return arrayLikeToString(new Uint8Array(input));
    },
    "array": function(input) {
        return arrayLikeToArrayLike(new Uint8Array(input), new Array(input.byteLength));
    },
    "arraybuffer": identity,
    "uint8array": function(input) {
        return new Uint8Array(input);
    },
    "nodebuffer": function(input) {
        return nodejsUtils.newBufferFrom(new Uint8Array(input));
    }
};

// uint8array to ?
transform["uint8array"] = {
    "string": arrayLikeToString,
    "array": function(input) {
        return arrayLikeToArrayLike(input, new Array(input.length));
    },
    // NOTE(review): returns the whole backing buffer; if the view has a
    // non-zero offset or a shorter length this includes extra bytes —
    // assumed to be full-buffer views at every call site, TODO confirm.
    "arraybuffer": function(input) {
        return input.buffer;
    },
    "uint8array": identity,
    "nodebuffer": function(input) {
        return nodejsUtils.newBufferFrom(input);
    }
};

// nodebuffer to ?
transform["nodebuffer"] = {
    "string": arrayLikeToString,
    "array": function(input) {
        return arrayLikeToArrayLike(input, new Array(input.length));
    },
    "arraybuffer": function(input) {
        return transform["nodebuffer"]["uint8array"](input).buffer;
    },
    "uint8array": function(input) {
        return arrayLikeToArrayLike(input, new Uint8Array(input.length));
    },
    "nodebuffer": identity
};
+
+/**
+ * Transform an input into any type.
+ * The supported output type are : string, array, uint8array, arraybuffer, nodebuffer.
+ * If no output type is specified, the unmodified input will be returned.
+ * @param {String} outputType the output type.
+ * @param {String|Array|ArrayBuffer|Uint8Array|Buffer} input the input to convert.
+ * @throws {Error} an Error if the browser doesn't support the requested output type.
+ */
+exports.transformTo = function(outputType, input) {
+ if (!input) {
+ // undefined, null, etc
+ // an empty string won't harm.
+ input = "";
+ }
+ if (!outputType) {
+ return input;
+ }
+ exports.checkSupport(outputType);
+ var inputType = exports.getTypeOf(input);
+ var result = transform[inputType][outputType](input);
+ return result;
+};
+
+/**
+ * Resolve all relative path components, "." and "..", in a path. If these relative components
+ * traverse above the root then the resulting path will only contain the final path component.
+ *
+ * All empty components, e.g. "//", are removed.
+ * @param {string} path A path with / or \ separators
+ * @returns {string} The path with all relative path components resolved.
+ */
+exports.resolve = function(path) {
+ var parts = path.split("/");
+ var result = [];
+ for (var index = 0; index < parts.length; index++) {
+ var part = parts[index];
+ // Allow the first and last component to be empty for trailing slashes.
+ if (part === "." || (part === "" && index !== 0 && index !== parts.length - 1)) {
+ continue;
+ } else if (part === "..") {
+ result.pop();
+ } else {
+ result.push(part);
+ }
+ }
+ return result.join("/");
+};
+
/**
 * Return the type of the input.
 * The type will be in a format valid for JSZip.utils.transformTo : string, array, uint8array, arraybuffer.
 * @param {Object} input the input to identify.
 * @return {String} the (lowercase) type of the input; undefined for
 * unsupported inputs (null, numbers, plain objects, ...).
 */
exports.getTypeOf = function(input) {
    if (typeof input === "string") {
        return "string";
    }
    // [[Class]] check works across realms, unlike instanceof Array
    if (Object.prototype.toString.call(input) === "[object Array]") {
        return "array";
    }
    // Check nodebuffer BEFORE uint8array: in recent node versions Buffer is a
    // Uint8Array subclass, so reordering these tests would misclassify Buffers.
    if (support.nodebuffer && nodejsUtils.isBuffer(input)) {
        return "nodebuffer";
    }
    if (support.uint8array && input instanceof Uint8Array) {
        return "uint8array";
    }
    if (support.arraybuffer && input instanceof ArrayBuffer) {
        return "arraybuffer";
    }
    // falls through: implicit undefined for anything else
};
+
+/**
+ * Throw an exception if the type is not supported.
+ * @param {String} type the type to check.
+ * @throws {Error} an Error if the browser doesn't support the requested type.
+ */
+exports.checkSupport = function(type) {
+ var supported = support[type.toLowerCase()];
+ if (!supported) {
+ throw new Error(type + " is not supported by this platform");
+ }
+};
+
// Largest value of an unsigned 16-bit zip header field (0xFFFF); also the
// "look in the zip64 record instead" marker.
exports.MAX_VALUE_16BITS = 65535;
// 0xFFFFFFFF as read by the 32-bit reader (bitwise ops yield a signed -1).
exports.MAX_VALUE_32BITS = -1; // well, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" is parsed as -1
+
+/**
+ * Prettify a string read as binary.
+ * @param {string} str the string to prettify.
+ * @return {string} a pretty string.
+ */
+exports.pretty = function(str) {
+ var res = "",
+ code, i;
+ for (i = 0; i < (str || "").length; i++) {
+ code = str.charCodeAt(i);
+ res += "\\x" + (code < 16 ? "0" : "") + code.toString(16).toUpperCase();
+ }
+ return res;
+};
+
+/**
+ * Defer the call of a function.
+ * @param {Function} callback the function to call asynchronously.
+ * @param {Array} args the arguments to give to the callback.
+ */
+exports.delay = function(callback, args, self) {
+ setImmediate(function () {
+ callback.apply(self || null, args || []);
+ });
+};
+
+/**
+ * Extends a prototype with an other, without calling a constructor with
+ * side effects. Inspired by nodejs' `utils.inherits`
+ * @param {Function} ctor the constructor to augment
+ * @param {Function} superCtor the parent constructor to use
+ */
+exports.inherits = function (ctor, superCtor) {
+ var Obj = function() {};
+ Obj.prototype = superCtor.prototype;
+ ctor.prototype = new Obj();
+};
+
+/**
+ * Merge the objects passed as parameters into a new one.
+ * @private
+ * @param {...Object} var_args All objects to merge.
+ * @return {Object} a new object with the data of the others.
+ */
+exports.extend = function() {
+ var result = {}, i, attr;
+ for (i = 0; i < arguments.length; i++) { // arguments is not enumerable in some browsers
+ for (attr in arguments[i]) {
+ if (Object.prototype.hasOwnProperty.call(arguments[i], attr) && typeof result[attr] === "undefined") {
+ result[attr] = arguments[i][attr];
+ }
+ }
+ }
+ return result;
+};
+
/**
 * Transform arbitrary content into a Promise.
 * @param {String} name a name for the content being processed.
 * @param {Object} inputData the content to process.
 * @param {Boolean} isBinary true if the content is not an unicode string
 * @param {Boolean} isOptimizedBinaryString true if the string content only has one byte per character.
 * @param {Boolean} isBase64 true if the string content is encoded with base64.
 * @return {Promise} a promise in a format usable by JSZip.
 */
exports.prepareContent = function(name, inputData, isBinary, isOptimizedBinaryString, isBase64) {

    // if inputData is already a promise, this flatten it.
    var promise = external.Promise.resolve(inputData).then(function(data) {

        // Blobs (and Files) cannot be read synchronously; detect them via
        // instanceof or via their [[Class]] for cross-realm objects.
        var isBlob = support.blob && (data instanceof Blob || ["[object File]", "[object Blob]"].indexOf(Object.prototype.toString.call(data)) !== -1);

        if (isBlob && typeof FileReader !== "undefined") {
            // read the blob asynchronously into an ArrayBuffer
            return new external.Promise(function (resolve, reject) {
                var reader = new FileReader();

                reader.onload = function(e) {
                    resolve(e.target.result);
                };
                reader.onerror = function(e) {
                    reject(e.target.error);
                };
                reader.readAsArrayBuffer(data);
            });
        } else {
            // everything else passes through unchanged
            return data;
        }
    });

    return promise.then(function(data) {
        var dataType = exports.getTypeOf(data);

        // getTypeOf returns undefined for unsupported inputs
        if (!dataType) {
            return external.Promise.reject(
                new Error("Can't read the data of '" + name + "'. Is it " +
                "in a supported JavaScript type (String, Blob, ArrayBuffer, etc) ?")
            );
        }
        // special case : it's way easier to work with Uint8Array than with ArrayBuffer
        if (dataType === "arraybuffer") {
            data = exports.transformTo("uint8array", data);
        } else if (dataType === "string") {
            if (isBase64) {
                data = base64.decode(data);
            }
            else if (isBinary) {
                // optimizedBinaryString === true means that the file has already been filtered with a 0xFF mask
                if (isOptimizedBinaryString !== true) {
                    // this is a string, not in a base64 format.
                    // Be sure that this is a correct "binary string"
                    data = string2binary(data);
                }
            }
        }
        return data;
    });
};
+
+},{"./base64":1,"./external":6,"./nodejsUtils":14,"./support":30,"setimmediate":54}],33:[function(require,module,exports){
+"use strict";
+var readerFor = require("./reader/readerFor");
+var utils = require("./utils");
+var sig = require("./signature");
+var ZipEntry = require("./zipEntry");
+var support = require("./support");
// class ZipEntries {{{
/**
 * All the entries in the zip file.
 * @constructor
 * @param {Object} loadOptions Options for loading the stream.
 */
function ZipEntries(loadOptions) {
    // parsed ZipEntry objects, filled in central-directory order
    this.files = [];
    // kept so every entry is created with the same options (decodeFileName, ...)
    this.loadOptions = loadOptions;
}
+ZipEntries.prototype = {
+ /**
+ * Check that the reader is on the specified signature.
+ * @param {string} expectedSignature the expected signature.
+ * @throws {Error} if it is an other signature.
+ */
+ checkSignature: function(expectedSignature) {
+ if (!this.reader.readAndCheckSignature(expectedSignature)) {
+ this.reader.index -= 4;
+ var signature = this.reader.readString(4);
+ throw new Error("Corrupted zip or bug: unexpected signature " + "(" + utils.pretty(signature) + ", expected " + utils.pretty(expectedSignature) + ")");
+ }
+ },
+ /**
+ * Check if the given signature is at the given index.
+ * @param {number} askedIndex the index to check.
+ * @param {string} expectedSignature the signature to expect.
+ * @return {boolean} true if the signature is here, false otherwise.
+ */
+ isSignature: function(askedIndex, expectedSignature) {
+ var currentIndex = this.reader.index;
+ this.reader.setIndex(askedIndex);
+ var signature = this.reader.readString(4);
+ var result = signature === expectedSignature;
+ this.reader.setIndex(currentIndex);
+ return result;
+ },
    /**
     * Read the end of the central directory.
     * The statement order below mirrors the EOCD record layout of the zip
     * spec and must not be changed.
     */
    readBlockEndOfCentral: function() {
        this.diskNumber = this.reader.readInt(2);
        this.diskWithCentralDirStart = this.reader.readInt(2);
        this.centralDirRecordsOnThisDisk = this.reader.readInt(2);
        this.centralDirRecords = this.reader.readInt(2);
        this.centralDirSize = this.reader.readInt(4);
        this.centralDirOffset = this.reader.readInt(4);

        this.zipCommentLength = this.reader.readInt(2);
        // warning : the encoding depends of the system locale
        // On a linux machine with LANG=en_US.utf8, this field is utf8 encoded.
        // On a windows machine, this field is encoded with the localized windows code page.
        var zipComment = this.reader.readData(this.zipCommentLength);
        var decodeParamType = support.uint8array ? "uint8array" : "array";
        // To get consistent behavior with the generation part, we will assume that
        // this is utf8 encoded unless specified otherwise.
        var decodeContent = utils.transformTo(decodeParamType, zipComment);
        this.zipComment = this.loadOptions.decodeFileName(decodeContent);
    },
+ /**
+ * Read the end of the Zip 64 central directory.
+ * Not merged with the method readEndOfCentral :
+ * The end of central can coexist with its Zip64 brother,
+ * I don't want to read the wrong number of bytes !
+ */
+ readBlockZip64EndOfCentral: function() {
+ this.zip64EndOfCentralSize = this.reader.readInt(8);
+ this.reader.skip(4);
+ // this.versionMadeBy = this.reader.readString(2);
+ // this.versionNeeded = this.reader.readInt(2);
+ this.diskNumber = this.reader.readInt(4);
+ this.diskWithCentralDirStart = this.reader.readInt(4);
+ this.centralDirRecordsOnThisDisk = this.reader.readInt(8);
+ this.centralDirRecords = this.reader.readInt(8);
+ this.centralDirSize = this.reader.readInt(8);
+ this.centralDirOffset = this.reader.readInt(8);
+
+ this.zip64ExtensibleData = {};
+ var extraDataSize = this.zip64EndOfCentralSize - 44,
+ index = 0,
+ extraFieldId,
+ extraFieldLength,
+ extraFieldValue;
+ while (index < extraDataSize) {
+ extraFieldId = this.reader.readInt(2);
+ extraFieldLength = this.reader.readInt(4);
+ extraFieldValue = this.reader.readData(extraFieldLength);
+ this.zip64ExtensibleData[extraFieldId] = {
+ id: extraFieldId,
+ length: extraFieldLength,
+ value: extraFieldValue
+ };
+ }
+ },
    /**
     * Read the end of the Zip 64 central directory locator.
     * @throws {Error} if the archive spans more than one disk (unsupported).
     */
    readBlockZip64EndOfCentralLocator: function() {
        this.diskWithZip64CentralDirStart = this.reader.readInt(4);
        this.relativeOffsetEndOfZip64CentralDir = this.reader.readInt(8);
        this.disksCount = this.reader.readInt(4);
        if (this.disksCount > 1) {
            throw new Error("Multi-volumes zip are not supported");
        }
    },
+ /**
+ * Read the local files, based on the offset read in the central part.
+ */
+ readLocalFiles: function() {
+ var i, file;
+ for (i = 0; i < this.files.length; i++) {
+ file = this.files[i];
+ this.reader.setIndex(file.localHeaderOffset);
+ this.checkSignature(sig.LOCAL_FILE_HEADER);
+ file.readLocalPart(this.reader);
+ file.handleUTF8();
+ file.processAttributes();
+ }
+ },
+ /**
+ * Read the central directory: create one ZipEntry per central file header
+ * found at centralDirOffset.
+ */
+ readCentralDir: function() {
+ var file;
+
+ this.reader.setIndex(this.centralDirOffset);
+ // consume entries while the next 4 bytes match a central file header signature
+ while (this.reader.readAndCheckSignature(sig.CENTRAL_FILE_HEADER)) {
+ file = new ZipEntry({
+ zip64: this.zip64
+ }, this.loadOptions);
+ file.readCentralPart(this.reader);
+ this.files.push(file);
+ }
+
+ // sanity check against the record count announced in the end of central dir
+ if (this.centralDirRecords !== this.files.length) {
+ if (this.centralDirRecords !== 0 && this.files.length === 0) {
+ // We expected some records but couldn't find ANY.
+ // This is really suspicious, as if something went wrong.
+ throw new Error("Corrupted zip or bug: expected " + this.centralDirRecords + " records in central dir, got " + this.files.length);
+ } else {
+ // We found some records but not all.
+ // Something is wrong but we got something for the user: no error here.
+ // console.warn("expected", this.centralDirRecords, "records in central dir, got", this.files.length);
+ }
+ }
+ },
+ /**
+ * Read the end of central directory, switching to the zip64 records when
+ * the classic record is saturated, and fix the reader's origin when extra
+ * data was prepended to the archive.
+ */
+ readEndOfCentral: function() {
+ // search backwards: the EOCD record lives at the end of the archive
+ var offset = this.reader.lastIndexOfSignature(sig.CENTRAL_DIRECTORY_END);
+ if (offset < 0) {
+ // Check if the content is a truncated zip or complete garbage.
+ // A "LOCAL_FILE_HEADER" is not required at the beginning (auto
+ // extractible zip for example) but it can give a good hint.
+ // If an ajax request was used without responseType, we will also
+ // get unreadable data.
+ var isGarbage = !this.isSignature(0, sig.LOCAL_FILE_HEADER);
+
+ if (isGarbage) {
+ throw new Error("Can't find end of central directory : is this a zip file ? " +
+ "If it is, see https://stuk.github.io/jszip/documentation/howto/read_zip.html");
+ } else {
+ throw new Error("Corrupted zip: can't find end of central directory");
+ }
+
+ }
+ this.reader.setIndex(offset);
+ // remember where the EOCD record starts: used below to detect prepended data
+ var endOfCentralDirOffset = offset;
+ this.checkSignature(sig.CENTRAL_DIRECTORY_END);
+ this.readBlockEndOfCentral();
+
+
+ /* extract from the zip spec :
+ 4) If one of the fields in the end of central directory
+ record is too small to hold required data, the field
+ should be set to -1 (0xFFFF or 0xFFFFFFFF) and the
+ ZIP64 format record should be created.
+ 5) The end of central directory record and the
+ Zip64 end of central directory locator record must
+ reside on the same disk when splitting or spanning
+ an archive.
+ */
+ if (this.diskNumber === utils.MAX_VALUE_16BITS || this.diskWithCentralDirStart === utils.MAX_VALUE_16BITS || this.centralDirRecordsOnThisDisk === utils.MAX_VALUE_16BITS || this.centralDirRecords === utils.MAX_VALUE_16BITS || this.centralDirSize === utils.MAX_VALUE_32BITS || this.centralDirOffset === utils.MAX_VALUE_32BITS) {
+ this.zip64 = true;
+
+ /*
+ Warning : the zip64 extension is supported, but ONLY if the 64bits integer read from
+ the zip file can fit into a 32bits integer. This cannot be solved : JavaScript represents
+ all numbers as 64-bit double precision IEEE 754 floating point numbers.
+ So, we have 53bits for integers and bitwise operations treat everything as 32bits.
+ see https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Operators/Bitwise_Operators
+ and http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf section 8.5
+ */
+
+ // should look for a zip64 EOCD locator
+ offset = this.reader.lastIndexOfSignature(sig.ZIP64_CENTRAL_DIRECTORY_LOCATOR);
+ if (offset < 0) {
+ throw new Error("Corrupted zip: can't find the ZIP64 end of central directory locator");
+ }
+ this.reader.setIndex(offset);
+ this.checkSignature(sig.ZIP64_CENTRAL_DIRECTORY_LOCATOR);
+ this.readBlockZip64EndOfCentralLocator();
+
+ // now the zip64 EOCD record
+ if (!this.isSignature(this.relativeOffsetEndOfZip64CentralDir, sig.ZIP64_CENTRAL_DIRECTORY_END)) {
+ // console.warn("ZIP64 end of central directory not where expected.");
+ // the declared offset is wrong: fall back to scanning for the record
+ this.relativeOffsetEndOfZip64CentralDir = this.reader.lastIndexOfSignature(sig.ZIP64_CENTRAL_DIRECTORY_END);
+ if (this.relativeOffsetEndOfZip64CentralDir < 0) {
+ throw new Error("Corrupted zip: can't find the ZIP64 end of central directory");
+ }
+ }
+ this.reader.setIndex(this.relativeOffsetEndOfZip64CentralDir);
+ this.checkSignature(sig.ZIP64_CENTRAL_DIRECTORY_END);
+ this.readBlockZip64EndOfCentral();
+ }
+
+ var expectedEndOfCentralDirOffset = this.centralDirOffset + this.centralDirSize;
+ if (this.zip64) {
+ expectedEndOfCentralDirOffset += 20; // end of central dir 64 locator
+ expectedEndOfCentralDirOffset += 12 /* should not include the leading 12 bytes */ + this.zip64EndOfCentralSize;
+ }
+
+ // a positive difference means unexpected bytes before/inside the zipfile
+ var extraBytes = endOfCentralDirOffset - expectedEndOfCentralDirOffset;
+
+ if (extraBytes > 0) {
+ // console.warn(extraBytes, "extra bytes at beginning or within zipfile");
+ if (this.isSignature(endOfCentralDirOffset, sig.CENTRAL_FILE_HEADER)) {
+ // The offsets seem wrong, but we have something at the specified offset.
+ // So… we keep it.
+ } else {
+ // the offset is wrong, update the "zero" of the reader
+ // this happens if data has been prepended (crx files for example)
+ this.reader.zero = extraBytes;
+ }
+ } else if (extraBytes < 0) {
+ throw new Error("Corrupted zip: missing " + Math.abs(extraBytes) + " bytes.");
+ }
+ },
+ /**
+ * Create the DataReader matching the type of `data`.
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the data to read.
+ */
+ prepareReader: function(data) {
+ this.reader = readerFor(data);
+ },
+ /**
+ * Read a zip file and create ZipEntries.
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the binary string representing a zip file.
+ */
+ load: function(data) {
+ this.prepareReader(data);
+ // parse back to front: the end of central directory first, then the
+ // central directory entries, then each local file header.
+ this.readEndOfCentral();
+ this.readCentralDir();
+ this.readLocalFiles();
+ }
+};
+// }}} end of ZipEntries
+module.exports = ZipEntries;
+
+},{"./reader/readerFor":22,"./signature":23,"./support":30,"./utils":32,"./zipEntry":34}],34:[function(require,module,exports){
+"use strict";
+var readerFor = require("./reader/readerFor");
+var utils = require("./utils");
+var CompressedObject = require("./compressedObject");
+var crc32fn = require("./crc32");
+var utf8 = require("./utf8");
+var compressions = require("./compressions");
+var support = require("./support");
+
+var MADE_BY_DOS = 0x00;
+var MADE_BY_UNIX = 0x03;
+
+/**
+ * Find a compression registered in JSZip.
+ * @param {string} compressionMethod the method magic to find.
+ * @return {Object|null} the JSZip compression object, null if none found.
+ */
+var findCompression = function(compressionMethod) {
+ for (var method in compressions) {
+ if (!Object.prototype.hasOwnProperty.call(compressions, method)) {
+ continue;
+ }
+ if (compressions[method].magic === compressionMethod) {
+ return compressions[method];
+ }
+ }
+ return null;
+};
+
+// class ZipEntry {{{
+/**
+ * An entry in the zip file.
+ * @constructor
+ * @param {Object} options Options of the current file.
+ * @param {Object} loadOptions Options for loading the stream.
+ */
+function ZipEntry(options, loadOptions) {
+ this.options = options;
+ this.loadOptions = loadOptions;
+}
+ZipEntry.prototype = {
+ /**
+ * say if the file is encrypted.
+ * @return {boolean} true if the file is encrypted, false otherwise.
+ */
+ isEncrypted: function() {
+ // bit 1 is set
+ return (this.bitFlag & 0x0001) === 0x0001;
+ },
+ /**
+ * say if the file has utf-8 filename/comment.
+ * @return {boolean} true if the filename/comment is in utf-8, false otherwise.
+ */
+ useUTF8: function() {
+ // bit 11 is set
+ return (this.bitFlag & 0x0800) === 0x0800;
+ },
+ /**
+ * Read the local part of a zip file and add the info in this object.
+ * @param {DataReader} reader the reader to use.
+ */
+ readLocalPart: function(reader) {
+ var compression, localExtraFieldsLength;
+
+ // we already know everything from the central dir !
+ // If the central dir data are false, we are doomed.
+ // On the bright side, the local part is scary : zip64, data descriptors, both, etc.
+ // The less data we get here, the more reliable this should be.
+ // Let's skip the whole header and dash to the data !
+ reader.skip(22);
+ // in some zip created on windows, the filename stored in the central dir contains \ instead of /.
+ // Strangely, the filename here is OK.
+ // I would love to treat these zip files as corrupted (see http://www.info-zip.org/FAQ.html#backslashes
+ // or APPNOTE#4.4.17.1, "All slashes MUST be forward slashes '/'") but there are a lot of bad zip generators...
+ // Search "unzip mismatching "local" filename continuing with "central" filename version" on
+ // the internet.
+ //
+ // I think I see the logic here : the central directory is used to display
+ // content and the local directory is used to extract the files. Mixing / and \
+ // may be used to display \ to windows users and use / when extracting the files.
+ // Unfortunately, this lead also to some issues : http://seclists.org/fulldisclosure/2009/Sep/394
+ this.fileNameLength = reader.readInt(2);
+ localExtraFieldsLength = reader.readInt(2); // can't be sure this will be the same as the central dir
+ // the fileName is stored as binary data, the handleUTF8 method will take care of the encoding.
+ this.fileName = reader.readData(this.fileNameLength);
+ reader.skip(localExtraFieldsLength);
+
+ if (this.compressedSize === -1 || this.uncompressedSize === -1) {
+ throw new Error("Bug or corrupted zip : didn't get enough information from the central directory " + "(compressedSize === -1 || uncompressedSize === -1)");
+ }
+
+ compression = findCompression(this.compressionMethod);
+ if (compression === null) { // no compression found
+ throw new Error("Corrupted zip : compression " + utils.pretty(this.compressionMethod) + " unknown (inner file : " + utils.transformTo("string", this.fileName) + ")");
+ }
+ // wrap the raw compressed bytes; actual decompression happens lazily, on demand
+ this.decompressed = new CompressedObject(this.compressedSize, this.uncompressedSize, this.crc32, compression, reader.readData(this.compressedSize));
+ },
+
+ /**
+ * Read the central part of a zip file and add the info in this object.
+ * @param {DataReader} reader the reader to use.
+ */
+ readCentralPart: function(reader) {
+ this.versionMadeBy = reader.readInt(2);
+ reader.skip(2);
+ // this.versionNeeded = reader.readInt(2);
+ this.bitFlag = reader.readInt(2);
+ // read as a 2-byte binary string: matched against compression.magic by findCompression
+ this.compressionMethod = reader.readString(2);
+ this.date = reader.readDate();
+ this.crc32 = reader.readInt(4);
+ this.compressedSize = reader.readInt(4);
+ this.uncompressedSize = reader.readInt(4);
+ var fileNameLength = reader.readInt(2);
+ this.extraFieldsLength = reader.readInt(2);
+ this.fileCommentLength = reader.readInt(2);
+ this.diskNumberStart = reader.readInt(2);
+ this.internalFileAttributes = reader.readInt(2);
+ this.externalFileAttributes = reader.readInt(4);
+ this.localHeaderOffset = reader.readInt(4);
+
+ if (this.isEncrypted()) {
+ throw new Error("Encrypted zip are not supported");
+ }
+
+ // will be read in the local part, see the comments there
+ reader.skip(fileNameLength);
+ this.readExtraFields(reader);
+ this.parseZIP64ExtraField(reader);
+ this.fileComment = reader.readData(this.fileCommentLength);
+ },
+
+ /**
+ * Parse the external file attributes and get the unix/dos permissions.
+ */
+ processAttributes: function () {
+ this.unixPermissions = null;
+ this.dosPermissions = null;
+ // the high byte of versionMadeBy identifies the host system (DOS, UNIX, ...)
+ var madeBy = this.versionMadeBy >> 8;
+
+ // Check if we have the DOS directory flag set.
+ // We look for it in the DOS and UNIX permissions
+ // but some unknown platform could set it as a compatibility flag.
+ this.dir = this.externalFileAttributes & 0x0010 ? true : false;
+
+ if(madeBy === MADE_BY_DOS) {
+ // first 6 bits (0 to 5)
+ this.dosPermissions = this.externalFileAttributes & 0x3F;
+ }
+
+ if(madeBy === MADE_BY_UNIX) {
+ this.unixPermissions = (this.externalFileAttributes >> 16) & 0xFFFF;
+ // the octal permissions are in (this.unixPermissions & 0x01FF).toString(8);
+ }
+
+ // fail safe : if the name ends with a / it probably means a folder
+ if (!this.dir && this.fileNameStr.slice(-1) === "/") {
+ this.dir = true;
+ }
+ },
+
+ /**
+ * Parse the ZIP64 extra field and merge the info in the current ZipEntry.
+ */
+ parseZIP64ExtraField: function() {
+ // 0x0001 is the id of the ZIP64 extended information extra field
+ if (!this.extraFields[0x0001]) {
+ return;
+ }
+
+ // should be something, preparing the extra reader
+ var extraReader = readerFor(this.extraFields[0x0001].value);
+
+ // I really hope that these 64bits integer can fit in 32 bits integer, because js
+ // won't let us have more.
+ // Each field is only present in the extra data when its 32-bit value is saturated.
+ if (this.uncompressedSize === utils.MAX_VALUE_32BITS) {
+ this.uncompressedSize = extraReader.readInt(8);
+ }
+ if (this.compressedSize === utils.MAX_VALUE_32BITS) {
+ this.compressedSize = extraReader.readInt(8);
+ }
+ if (this.localHeaderOffset === utils.MAX_VALUE_32BITS) {
+ this.localHeaderOffset = extraReader.readInt(8);
+ }
+ if (this.diskNumberStart === utils.MAX_VALUE_32BITS) {
+ this.diskNumberStart = extraReader.readInt(4);
+ }
+ },
+ /**
+ * Read the extra fields (central directory version) and index them by id.
+ * @param {DataReader} reader the reader to use.
+ */
+ readExtraFields: function(reader) {
+ var end = reader.index + this.extraFieldsLength,
+ extraFieldId,
+ extraFieldLength,
+ extraFieldValue;
+
+ if (!this.extraFields) {
+ this.extraFields = {};
+ }
+
+ // each field is a 4-byte header (2-byte id + 2-byte length) followed by the
+ // data: loop while a full header still fits strictly before `end`
+ while (reader.index + 4 < end) {
+ extraFieldId = reader.readInt(2);
+ extraFieldLength = reader.readInt(2);
+ extraFieldValue = reader.readData(extraFieldLength);
+
+ this.extraFields[extraFieldId] = {
+ id: extraFieldId,
+ length: extraFieldLength,
+ value: extraFieldValue
+ };
+ }
+
+ // whatever happened above, land the reader right after the extra block
+ reader.setIndex(end);
+ },
+ /**
+ * Apply an UTF8 transformation if needed.
+ */
+ handleUTF8: function() {
+ // utils.transformTo target type: typed array when available, plain array otherwise
+ var decodeParamType = support.uint8array ? "uint8array" : "array";
+ if (this.useUTF8()) {
+ this.fileNameStr = utf8.utf8decode(this.fileName);
+ this.fileCommentStr = utf8.utf8decode(this.fileComment);
+ } else {
+ var upath = this.findExtraFieldUnicodePath();
+ if (upath !== null) {
+ this.fileNameStr = upath;
+ } else {
+ // ASCII text or unsupported code page
+ var fileNameByteArray = utils.transformTo(decodeParamType, this.fileName);
+ this.fileNameStr = this.loadOptions.decodeFileName(fileNameByteArray);
+ }
+
+ var ucomment = this.findExtraFieldUnicodeComment();
+ if (ucomment !== null) {
+ this.fileCommentStr = ucomment;
+ } else {
+ // ASCII text or unsupported code page
+ var commentByteArray = utils.transformTo(decodeParamType, this.fileComment);
+ this.fileCommentStr = this.loadOptions.decodeFileName(commentByteArray);
+ }
+ }
+ },
+
+ /**
+ * Find the unicode path declared in the extra field, if any.
+ * @return {String} the unicode path, null otherwise.
+ */
+ findExtraFieldUnicodePath: function() {
+ // 0x7075 is the Info-ZIP Unicode Path extra field id
+ var upathField = this.extraFields[0x7075];
+ if (upathField) {
+ var extraReader = readerFor(upathField.value);
+
+ // wrong version
+ if (extraReader.readInt(1) !== 1) {
+ return null;
+ }
+
+ // the crc of the filename changed, this field is out of date.
+ if (crc32fn(this.fileName) !== extraReader.readInt(4)) {
+ return null;
+ }
+
+ // 5 = version byte (1) + crc32 (4) consumed above
+ return utf8.utf8decode(extraReader.readData(upathField.length - 5));
+ }
+ return null;
+ },
+
+ /**
+ * Find the unicode comment declared in the extra field, if any.
+ * @return {String} the unicode comment, null otherwise.
+ */
+ findExtraFieldUnicodeComment: function() {
+ // 0x6375 is the Info-ZIP Unicode Comment extra field id
+ var ucommentField = this.extraFields[0x6375];
+ if (ucommentField) {
+ var extraReader = readerFor(ucommentField.value);
+
+ // wrong version
+ if (extraReader.readInt(1) !== 1) {
+ return null;
+ }
+
+ // the crc of the comment changed, this field is out of date.
+ if (crc32fn(this.fileComment) !== extraReader.readInt(4)) {
+ return null;
+ }
+
+ // 5 = version byte (1) + crc32 (4) consumed above
+ return utf8.utf8decode(extraReader.readData(ucommentField.length - 5));
+ }
+ return null;
+ }
+};
+module.exports = ZipEntry;
+
+},{"./compressedObject":2,"./compressions":3,"./crc32":4,"./reader/readerFor":22,"./support":30,"./utf8":31,"./utils":32}],35:[function(require,module,exports){
+"use strict";
+
+var StreamHelper = require("./stream/StreamHelper");
+var DataWorker = require("./stream/DataWorker");
+var utf8 = require("./utf8");
+var CompressedObject = require("./compressedObject");
+var GenericWorker = require("./stream/GenericWorker");
+
+/**
+ * A simple object representing a file in the zip file.
+ * @constructor
+ * @param {string} name the name of the file
+ * @param {String|ArrayBuffer|Uint8Array|Buffer} data the data
+ * @param {Object} options the options of the file
+ */
+var ZipObject = function(name, data, options) {
+ this.name = name;
+ this.dir = options.dir;
+ this.date = options.date;
+ this.comment = options.comment;
+ this.unixPermissions = options.unixPermissions;
+ this.dosPermissions = options.dosPermissions;
+
+ this._data = data;
+ this._dataBinary = options.binary;
+ // keep only the compression
+ this.options = {
+ compression : options.compression,
+ compressionOptions : options.compressionOptions
+ };
+};
+
+ZipObject.prototype = {
+ /**
+ * Create an internal stream for the content of this object.
+ * @param {String} type the type of each chunk.
+ * @return StreamHelper the stream.
+ */
+ internalStream: function (type) {
+ var result = null, outputType = "string";
+ try {
+ if (!type) {
+ throw new Error("No output type specified.");
+ }
+ outputType = type.toLowerCase();
+ var askUnicodeString = outputType === "string" || outputType === "text";
+ if (outputType === "binarystring" || outputType === "text") {
+ outputType = "string";
+ }
+ result = this._decompressWorker();
+
+ var isUnicodeString = !this._dataBinary;
+
+ if (isUnicodeString && !askUnicodeString) {
+ result = result.pipe(new utf8.Utf8EncodeWorker());
+ }
+ if (!isUnicodeString && askUnicodeString) {
+ result = result.pipe(new utf8.Utf8DecodeWorker());
+ }
+ } catch (e) {
+ result = new GenericWorker("error");
+ result.error(e);
+ }
+
+ return new StreamHelper(result, outputType, "");
+ },
+
+ /**
+ * Prepare the content in the asked type.
+ * @param {String} type the type of the result.
+ * @param {Function} onUpdate a function to call on each internal update.
+ * @return Promise the promise of the result.
+ */
+ async: function (type, onUpdate) {
+ return this.internalStream(type).accumulate(onUpdate);
+ },
+
+ /**
+ * Prepare the content as a nodejs stream.
+ * @param {String} type the type of each chunk.
+ * @param {Function} onUpdate a function to call on each internal update.
+ * @return Stream the stream.
+ */
+ nodeStream: function (type, onUpdate) {
+ return this.internalStream(type || "nodebuffer").toNodejsStream(onUpdate);
+ },
+
+ /**
+ * Return a worker for the compressed content.
+ * @private
+ * @param {Object} compression the compression object to use.
+ * @param {Object} compressionOptions the options to use when compressing.
+ * @return Worker the worker.
+ */
+ _compressWorker: function (compression, compressionOptions) {
+ if (
+ this._data instanceof CompressedObject &&
+ this._data.compression.magic === compression.magic
+ ) {
+ return this._data.getCompressedWorker();
+ } else {
+ var result = this._decompressWorker();
+ if(!this._dataBinary) {
+ result = result.pipe(new utf8.Utf8EncodeWorker());
+ }
+ return CompressedObject.createWorkerFrom(result, compression, compressionOptions);
+ }
+ },
+ /**
+ * Return a worker for the decompressed content.
+ * @private
+ * @return Worker the worker.
+ */
+ _decompressWorker : function () {
+ if (this._data instanceof CompressedObject) {
+ return this._data.getContentWorker();
+ } else if (this._data instanceof GenericWorker) {
+ return this._data;
+ } else {
+ return new DataWorker(this._data);
+ }
+ }
+};
+
+var removedMethods = ["asText", "asBinary", "asNodeBuffer", "asUint8Array", "asArrayBuffer"];
+var removedFn = function () {
+ throw new Error("This method has been removed in JSZip 3.0, please check the upgrade guide.");
+};
+
+for(var i = 0; i < removedMethods.length; i++) {
+ ZipObject.prototype[removedMethods[i]] = removedFn;
+}
+module.exports = ZipObject;
+
+},{"./compressedObject":2,"./stream/DataWorker":27,"./stream/GenericWorker":28,"./stream/StreamHelper":29,"./utf8":31}],36:[function(require,module,exports){
+(function (global){
+'use strict';
+var Mutation = global.MutationObserver || global.WebKitMutationObserver;
+
+var scheduleDrain;
+
+{
+ if (Mutation) {
+ var called = 0;
+ var observer = new Mutation(nextTick);
+ var element = global.document.createTextNode('');
+ observer.observe(element, {
+ characterData: true
+ });
+ scheduleDrain = function () {
+ element.data = (called = ++called % 2);
+ };
+ } else if (!global.setImmediate && typeof global.MessageChannel !== 'undefined') {
+ var channel = new global.MessageChannel();
+ channel.port1.onmessage = nextTick;
+ scheduleDrain = function () {
+ channel.port2.postMessage(0);
+ };
+ } else if ('document' in global && 'onreadystatechange' in global.document.createElement('script')) {
+ scheduleDrain = function () {
+
+ // Create a