diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d0ecdd1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "makefile.configureOnOpen": true +} \ No newline at end of file diff --git a/README.md b/README.md index c763c70..62f0203 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,7 @@ This layer allows you to run DuckDB queries without writing any code or deployin ## Current status - ๐Ÿงช Experimental -- ๐Ÿ“‘ Limited documentation -- ๐Ÿ’ƒ Working under tests to date +- ๐Ÿ“ฑ Arm64 only ## Features @@ -18,6 +17,43 @@ This layer allows you to run DuckDB queries without writing any code or deployin - **Data Transformation**: Perform complex data transformations using SQL queries. - **Integration with Step Functions**: Easily integrate with AWS Step Functions for orchestrating data workflows. +## Getting Started + +### Adding the Lambda Layer to your AWS account + +The Lambda Layer for this DuckDB runtime is available in the AWS Serverless Application Repository. You can deploy it directly from the AWS Management Console or using the AWS CLI. + +- Install from the AWS Console: https://serverlessrepo.aws.amazon.com/applications/eu-west-1/949339270388/duck-query-lambda +- Install using AWS SAM or CloudFormation: +```yaml + duckquerylambda: + Type: AWS::Serverless::Application + Properties: + Location: + ApplicationId: arn:aws:serverlessrepo:eu-west-1:949339270388:applications/duck-query-lambda + SemanticVersion: 0.0.7 # x-release-please-version +``` + +- Install using the AWS CDK: +```typescript +import * as sam from "aws-cdk-lib/aws-sam"; +... + + new sam.CfnApplication(this, "DuckQueryRuntimeLayer", { + location: { + applicationId: "arn:aws:serverlessrepo:eu-west-1:949339270388:applications/duck-query-lambda", + semanticVersion: "0.0.7", // x-release-please-version + }, + }); +``` + +An example SAM project can be found in the [`examples/`](./examples/) directory. + +### Creating a Lambda function using the DuckDB runtime + +You don't need to write any code to use the DuckDB runtime. You can create a Lambda function that uses the runtime, give it some IAM permissions and then invoke it with a query. + + ## Usage ### Prerequisites @@ -71,8 +107,9 @@ This layer allows you to run DuckDB queries without writing any code or deployin } ``` -### Example +## Examples +### Using the Lambda Layer in an AWS Step Function Here is an example of how to use the Lambda Layer in an AWS Step Function: ```json @@ -88,4 +125,18 @@ Here is an example of how to use the Lambda Layer in an AWS Step Function: "End": true } } -} \ No newline at end of file +} +``` + +### Invoking the Lambda Function and getting the query results back synchronously + +By default, the Lambda function will not return the query results. This is because it's not trivial to convert all results back to JSON in a way that every user expects. If you do want to get the results back synchronously, you can write them to a temporary file in the Lambda and the function will then return the contents of that file, base64 encoded. + +Here is an example of how to do this: + +```json +{ + "query": "COPY (SELECT * FROM 'https://github.com/Teradata/kylo/raw/refs/heads/master/samples/sample-data/parquet/userdata1.parquet' LIMIT 10) TO '/tmp/output.json'", + "outputFile": "/tmp/output.json" +} +``` diff --git a/examples/sam/.gitignore b/examples/sam/.gitignore new file mode 100644 index 0000000..4808264 --- /dev/null +++ b/examples/sam/.gitignore @@ -0,0 +1,244 @@ + +# Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### OSX ### +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Ruby plugin and RubyMine +/.rakeTasks + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +.pytest_cache/ +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule.* + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +.history + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# Build folder + +*/build/* + +# End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode \ No newline at end of file diff --git a/examples/sam/README.md b/examples/sam/README.md new file mode 100644 index 0000000..228875c --- /dev/null +++ b/examples/sam/README.md @@ -0,0 +1,130 @@ +# duck-query + +This project contains source code and supporting files for a serverless application that you can deploy with the SAM CLI. It includes the following files and folders. + +- hello_world - Code for the application's Lambda function. +- events - Invocation events that you can use to invoke the function. +- tests - Unit tests for the application code. +- template.yaml - A template that defines the application's AWS resources. + +The application uses several AWS resources, including Lambda functions and an API Gateway API. These resources are defined in the `template.yaml` file in this project. You can update the template to add AWS resources through the same deployment process that updates your application code. + +If you prefer to use an integrated development environment (IDE) to build and test your application, you can use the AWS Toolkit. +The AWS Toolkit is an open source plug-in for popular IDEs that uses the SAM CLI to build and deploy serverless applications on AWS. The AWS Toolkit also adds a simplified step-through debugging experience for Lambda function code. See the following links to get started. + +* [CLion](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [GoLand](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [IntelliJ](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [WebStorm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [Rider](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [PhpStorm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [PyCharm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [RubyMine](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [DataGrip](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [VS Code](https://docs.aws.amazon.com/toolkit-for-vscode/latest/userguide/welcome.html) +* [Visual Studio](https://docs.aws.amazon.com/toolkit-for-visual-studio/latest/user-guide/welcome.html) + +## Deploy the sample application + +The Serverless Application Model Command Line Interface (SAM CLI) is an extension of the AWS CLI that adds functionality for building and testing Lambda applications. It uses Docker to run your functions in an Amazon Linux environment that matches Lambda. It can also emulate your application's build environment and API. + +To use the SAM CLI, you need the following tools. + +* SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) +* [Python 3 installed](https://www.python.org/downloads/) +* Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community) + +To build and deploy your application for the first time, run the following in your shell: + +```bash +sam build --use-container +sam deploy --guided +``` + +The first command will build the source of your application. The second command will package and deploy your application to AWS, with a series of prompts: + +* **Stack Name**: The name of the stack to deploy to CloudFormation. This should be unique to your account and region, and a good starting point would be something matching your project name. +* **AWS Region**: The AWS region you want to deploy your app to. +* **Confirm changes before deploy**: If set to yes, any change sets will be shown to you before execution for manual review. If set to no, the AWS SAM CLI will automatically deploy application changes. +* **Allow SAM CLI IAM role creation**: Many AWS SAM templates, including this example, create AWS IAM roles required for the AWS Lambda function(s) included to access AWS services. By default, these are scoped down to minimum required permissions. To deploy an AWS CloudFormation stack which creates or modifies IAM roles, the `CAPABILITY_IAM` value for `capabilities` must be provided. If permission isn't provided through this prompt, to deploy this example you must explicitly pass `--capabilities CAPABILITY_IAM` to the `sam deploy` command. +* **Save arguments to samconfig.toml**: If set to yes, your choices will be saved to a configuration file inside the project, so that in the future you can just re-run `sam deploy` without parameters to deploy changes to your application. + +You can find your API Gateway Endpoint URL in the output values displayed after deployment. + +## Use the SAM CLI to build and test locally + +Build your application with the `sam build --use-container` command. + +```bash +duck-query$ sam build --use-container +``` + +The SAM CLI installs dependencies defined in `hello_world/requirements.txt`, creates a deployment package, and saves it in the `.aws-sam/build` folder. + +Test a single function by invoking it directly with a test event. An event is a JSON document that represents the input that the function receives from the event source. Test events are included in the `events` folder in this project. + +Run functions locally and invoke them with the `sam local invoke` command. + +```bash +duck-query$ sam local invoke HelloWorldFunction --event events/event.json +``` + +The SAM CLI can also emulate your application's API. Use the `sam local start-api` to run the API locally on port 3000. + +```bash +duck-query$ sam local start-api +duck-query$ curl http://localhost:3000/ +``` + +The SAM CLI reads the application template to determine the API's routes and the functions that they invoke. The `Events` property on each function's definition includes the route and method for each path. + +```yaml + Events: + HelloWorld: + Type: Api + Properties: + Path: /hello + Method: get +``` + +## Add a resource to your application +The application template uses AWS Serverless Application Model (AWS SAM) to define application resources. AWS SAM is an extension of AWS CloudFormation with a simpler syntax for configuring common serverless application resources such as functions, triggers, and APIs. For resources not included in [the SAM specification](https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md), you can use standard [AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html) resource types. + +## Fetch, tail, and filter Lambda function logs + +To simplify troubleshooting, SAM CLI has a command called `sam logs`. `sam logs` lets you fetch logs generated by your deployed Lambda function from the command line. In addition to printing the logs on the terminal, this command has several nifty features to help you quickly find the bug. + +`NOTE`: This command works for all AWS Lambda functions; not just the ones you deploy using SAM. + +```bash +duck-query$ sam logs -n HelloWorldFunction --stack-name "duck-query" --tail +``` + +You can find more information and examples about filtering Lambda function logs in the [SAM CLI Documentation](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-logging.html). + +## Tests + +Tests are defined in the `tests` folder in this project. Use PIP to install the test dependencies and run tests. + +```bash +duck-query$ pip install -r tests/requirements.txt --user +# unit test +duck-query$ python -m pytest tests/unit -v +# integration test, requiring deploying the stack first. +# Create the env variable AWS_SAM_STACK_NAME with the name of the stack we are testing +duck-query$ AWS_SAM_STACK_NAME="duck-query" python -m pytest tests/integration -v +``` + +## Cleanup + +To delete the sample application that you created, use the AWS CLI. Assuming you used your project name for the stack name, you can run the following: + +```bash +sam delete --stack-name "duck-query" +``` + +## Resources + +See the [AWS SAM developer guide](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/what-is-sam.html) for an introduction to SAM specification, the SAM CLI, and serverless application concepts. + +Next, you can use AWS Serverless Application Repository to deploy ready to use Apps that go beyond hello world samples and learn how authors developed their applications: [AWS Serverless Application Repository main page](https://aws.amazon.com/serverless/serverlessrepo/) diff --git a/examples/sam/samconfig.toml b/examples/sam/samconfig.toml new file mode 100644 index 0000000..c6d7ba3 --- /dev/null +++ b/examples/sam/samconfig.toml @@ -0,0 +1,33 @@ +# More information about the configuration file can be found here: +# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html +version = 0.1 + +[default.global.parameters] +stack_name = "duck-query" + +[default.build.parameters] +cached = true +parallel = true + +[default.validate.parameters] +lint = true + +[default.deploy.parameters] +capabilities = "CAPABILITY_IAM CAPABILITY_AUTO_EXPAND" +confirm_changeset = true +resolve_s3 = true +s3_prefix = "duck-query" +region = "eu-west-1" +image_repositories = [] + +[default.package.parameters] +resolve_s3 = true + +[default.sync.parameters] +watch = true + +[default.local_start_api.parameters] +warm_containers = "EAGER" + +[default.local_start_lambda.parameters] +warm_containers = "EAGER" diff --git a/examples/sam/template.yaml b/examples/sam/template.yaml new file mode 100644 index 0000000..831593f --- /dev/null +++ b/examples/sam/template.yaml @@ -0,0 +1,36 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + duck-query + + Lambda function for querying DuckDB with the DuckDB custom runtime + +Globals: + Function: + Timeout: 60 + +Resources: + DuckQueryRuntimeLayer: + Type: AWS::Serverless::Application + Properties: + Location: + ApplicationId: arn:aws:serverlessrepo:eu-west-1:949339270388:applications/duck-query-lambda + SemanticVersion: 0.0.7 + + DuckQueryFunction: + Type: AWS::Serverless::Function + Properties: + Runtime: provided.al2023 + Handler: bootstrap + Layers: + - !GetAtt DuckQueryRuntimeLayer.Outputs.DuckQueryLayerVersionArn + Architectures: + - arm64 + + +Outputs: + DuckQueryFunctionArn: + Description: ARN for the DuckQueryFunction + Value: !GetAtt DuckQueryFunction.Arn + Export: + Name: !Sub 'DuckQueryFunctionArn-${AWS::StackName}' \ No newline at end of file