diff --git a/core/prisma/schema.prisma b/core/prisma/schema.prisma index 6470d0040..3e2fd3535 100644 --- a/core/prisma/schema.prisma +++ b/core/prisma/schema.prisma @@ -16,6 +16,7 @@ generator sync { client_format = "folder" } +/// @local model CRDTOperation { id Int @id @default(autoincrement()) @@ -33,6 +34,7 @@ model CRDTOperation { @@map("crdt_operation") } +/// @local model CloudCRDTOperation { id Int @id @default(autoincrement()) @@ -51,6 +53,7 @@ model CloudCRDTOperation { } /// @deprecated: This model has to exist solely for backwards compatibility. +/// @local model Node { id Int @id @default(autoincrement()) pub_id Bytes @unique @@ -65,6 +68,7 @@ model Node { // represents a single `.db` file (SQLite DB) that is paired to the current library. // A `LibraryInstance` is always owned by a single `Node` but it's possible for that node to change (or two to be owned by a single node). +/// @local model Instance { id Int @id @default(autoincrement()) // This is is NOT globally unique pub_id Bytes @unique // This UUID is meaningless and exists soley cause the `uhlc::ID` must be 16-bit. Really this should be derived from the `identity` field. @@ -89,6 +93,7 @@ model Instance { @@map("instance") } +/// @local model Statistics { id Int @id @default(autoincrement()) date_captured DateTime @default(now()) diff --git a/docs/developers/architecture/sync.mdx b/docs/developers/architecture/sync.mdx index 587f6db7d..bf298f737 100644 --- a/docs/developers/architecture/sync.mdx +++ b/docs/developers/architecture/sync.mdx @@ -4,8 +4,8 @@ index: 12 # Sync -Spacedrive synchronizes data using a combination of master-slave replication and last-write-wins CRDTs, -with the synchronization method encoded into the Prisma schema using [record type attributes](#record-types). +Spacedrive synchronizes libraries by treating SQLite cells as last-write-wins CRDTs, +with sync metadata about each model being encoded into Prisma schema using [model attributes](#model-types). 
In the cases where LWW CRDTs are used, conflicts are resolved using a [Hybrid Logical Clock](https://github.com/atolab/uhlc-rs) @@ -14,12 +14,12 @@ to determine the ordering of events. We would be remiss to not credit credit [Actual Budget](https://actualbudget.com/) with many of the CRDT concepts used in Spacedrive's sync system. -## Record Types +## Model Types All data in a library conforms to one of the following types. Each type uses a different strategy for syncing. -### Local Records +### Local Local records exist entirely outside of the sync system. They don't have Sync IDs and never leave the node they were created on. @@ -28,63 +28,71 @@ Used for Nodes, Statistics, and Sync Events. `@local` -### Owned Records - -Owned records are only ever modified by the node they are created by, -so they can be synced in a master-slave fashion. -The creator of an owned record dictates the state of the record to other nodes, -who will simply accept new changes without considering conflicts. - -File paths are owned records since they only exist on one node, -and that node can inform all other nodes about the correct state of the paths. - -Used for Locations, Paths, and Volumes. - -`@owned(owner: String, id?: String)` - -- `owner` - Field that identifies the owner of this model. - If a scalar, will directly use that value in sync operations. - If a relation, the Sync ID of the related model will be resolved for sync operations. -- `id` - Scalar field to override the default Sync ID. - -### Shared Records +### Shared Shared records encompass most data synced in the CRDT fashion. Updates are applied per-field using a last-write-wins strategy. Used for Objects, Tags, Spaces, and Jobs. -`@shared(create: SharedCreateType, id?: String)` +`@shared(id?: String, modelId: Int)` - `id` - Scalar field to override the default Sync ID. -- `create` - How the model should be created. 
- - `Unique` (default): Model can be created with many required arguemnts, - but ID provided _must_ be unique across all nodes. - Useful for Tags since their IDs are non-deterministic. - - `Atomic`: Require the model to have no required arguments apart from ID and apply all create arguments as atomic updates. - Necessary for models with the same ID that can be created on multiple nodes. - Useful for Objects since their ID is dependent on their content, - and could be the same across nodes. +- `modelId` - Integer to identify the model by. Helps save on bandwidth and storage as opposed to storing model names as strings. -### Relation Records +### Relation Similar to shared records, but represent a many-to-many relation between two records. Sync ID is the combination of `item` and `group` Sync IDs. Used for TagOnFile and FileInSpace. -`@relation(item: String, group: String)` +`@relation(item: String, group: String, modelId: Int)` - `item` - Field that identifies the item that the relation is connecting. - Similar to the `owner` argument of `@owned`. - `group` - Field that identifies the group that the item should be connected to. - Similar to the `owner` argument of `@owned`. + - `modelId` - Integer to identify the model by. Helps save on bandwidth and storage as opposed to storing model names as strings. -## Other Prisma Attributes +## Sync Actors -`@node` +The sync system is comprised of a number of actors that send, receive, and ingest sync operations. -Indicates that a relation field should be set to the current node. -This could be done manually, -but `@node` allows `node_id` fields to be resolved from the `node_id` field of a `CRDTOperation`, -saving on bandwidth +### Ingest + +The most important of these is the +[ingest actor](https://github.com/spacedriveapp/spacedrive/blob/main/core/crates/sync/src/ingest.rs). 
+It exists entirely independently of both the core and the cloud sync actors, which allows it to be interacted with by many different systems, +while ensuring that only one process is responsible for providing it with operations to ingest at a time. + +Its work loop goes something like this: + +- Wait for a notification that new sync operations are available to ingest +- Request new operations from whatever process holds the communication channel with the ingester. + - This request is accompanied by timestamps of the last operation ingested for each instance of the library, to be used to fetch only the latest necessary sync operations. + - If the communication channel is dropped, the work loop will restart +- Ingest the sync operations in batches distinguished by instance, model, and record id + - If the batch contains a `Delete`, all other operations are ignored and the record is deleted. + - If the batch contains a `Create`, all `Update` operations are applied on top of it to reduce database load. + - If the batch only contains `Update` operations, they are applied on top of a fake `Create` operation to reduce database load. + - The latest timestamp of the instance is updated to the timestamp of the last operation ingested. +- Return to the beginning, waiting for a notification of new operations + +### Cloud Send/Receive/Ingest + +Each of these actors plays a different role in getting sync operations to and from the cloud, and into the ingest actor. + +#### [Send](https://github.com/spacedriveapp/spacedrive/blob/main/core/src/cloud/sync/send.rs) + +When new sync operations are created on an instance, this actor will attempt to obtain a lock on cloud sync operations for that particular instance, +and will upload them in a compressed base64 format. + +The lock on cloud sync operations is necessary to prevent multiple processes from attempting to upload operations at the same time, +and is implemented as a short-expiry Redis entry.
+ +#### [Receive](https://github.com/spacedriveapp/spacedrive/blob/main/core/src/cloud/sync/receive.rs) + +Downloads sync operations from the cloud periodically, and stores them in the `cloud_crdt_operation` table. + +#### [Ingest](https://github.com/spacedriveapp/spacedrive/blob/main/core/src/cloud/sync/ingest.rs) + +Reads sync operations from the `cloud_crdt_operation` table, and sends them to the ingest actor. diff --git a/docs/product/guides/database-sync.mdx b/docs/product/guides/database-sync.mdx index 75693f0f3..f4d2a1821 100644 --- a/docs/product/guides/database-sync.mdx +++ b/docs/product/guides/database-sync.mdx @@ -2,4 +2,21 @@ index: 100 --- -# Database sync +# Database Sync + +Spacedrive Libraries are just databases shared among multiple instances. When these instances connect, either via Spacedrive Cloud or peer-to-peer, the changes to each instance's local database are synchronized. +The technical details of this process are covered in the [Developer docs](/docs/developers/architecture/sync). + +## Cloud Sync + +When connected to Spacedrive Cloud, a Spacedrive Instance will send its changes to be stored in the Cloud in an encrypted format. +When another instance connects to the Cloud, it will download these changes and apply them to its local database. + +This process does not require multiple instances to be connected at the same time. +Spacedrive Cloud will store the changes for when other instances connect. + +## Peer-To-Peer Sync + +When connected over peer-to-peer, two instances will exchange their database changes directly. +The contents of the database changes are not encrypted themselves; +instead, the connection between the two instances will ensure that they are encrypted during transit.