diff --git a/.trae/documents/添加order_item与product的join关系.md b/.trae/documents/添加order_item与product的join关系.md
new file mode 100644
index 0000000..c722df2
--- /dev/null
+++ b/.trae/documents/添加order_item与product的join关系.md
@@ -0,0 +1,37 @@
+## Implementation Plan
+
+### 1. Problem Analysis
+- Error: ER_CANT_AGGREGATE_2COLLATIONS - collation mismatch between character sets
+- Cause: the order_item table uses utf8mb4_0900_ai_ci, while the product table uses utf8mb4_unicode_ci
+- Solution: specify the collation explicitly in the JOIN condition, or change the JOIN approach
+
+### 2. Adjust the join configuration
+Modify the join to product in order_item.yml, adding a character set conversion so that the collations match:
+
+```yaml
+joins:
+  - name: product
+    sql: "CONVERT({product.siteSkus} USING utf8mb4) COLLATE utf8mb4_unicode_ci LIKE CONCAT('%', {CUBE}.sku, '%')"
+    relationship: many_to_one
+```
+
+### 3. Optimize the join logic
+Since siteSkus may be stored as JSON or contain multiple SKUs, a JSON function is more reliable (if the database supports it):
+
+```yaml
+joins:
+  - name: product
+    sql: "JSON_CONTAINS({product.siteSkus}, JSON_QUOTE({CUBE}.sku))"
+    relationship: many_to_one
+```
+
+### 4. Implementation Steps
+1. Edit order_item.yml and modify the product join configuration
+2. Choose the appropriate join condition:
+   - If siteSkus is a comma-separated list: use CONVERT to align the collation
+   - If siteSkus is a JSON array: use JSON_CONTAINS
+3. Verify that the query file order_item_product_by_sku_quantity_sum.json executes correctly
+4. Confirm that the query results show product information and the corresponding sales quantity
+
+### 5. Expected Result
+With the correct join in place, the query can relate order_item and product data and compute the actual sales quantity for each product (based on order_item.quantity)
\ No newline at end of file diff --git a/container/cube/conf/cube.js b/container/cube/conf/cube.js deleted file mode 100644 index 3587701..0000000 --- a/container/cube/conf/cube.js +++ /dev/null @@ -1,5 +0,0 @@ -module.exports = { - dbType: 'mysql', - apiSecret: 'your-secret-here', - webSockets: true -}; \ No newline at end of file diff --git a/container/cube/conf/model/cubes/order_item.yml b/container/cube/conf/model/cubes/order_item.yml index e8d81fa..d42a320 100644 --- a/container/cube/conf/model/cubes/order_item.yml +++ b/container/cube/conf/model/cubes/order_item.yml @@ -4,6 +4,9 @@ cubes: data_source: default joins: + - name: product + sql: "CONVERT({product.siteskus} USING utf8mb4) COLLATE utf8mb4_unicode_ci LIKE CONCAT('%', {CUBE}.sku, '%')" + relationship: many_to_one - name: order sql: "{CUBE}.`orderId` = {order.id}" relationship: many_to_one diff --git a/container/cube/conf/model/cubes/product_attributes_dict_item.yml b/container/cube/conf/model/cubes/product_attributes_dict_item.yml index e08595a..18b4d12 100644 --- a/container/cube/conf/model/cubes/product_attributes_dict_item.yml +++ b/container/cube/conf/model/cubes/product_attributes_dict_item.yml @@ -5,18 +5,18 @@ cubes: joins: - name: dict_item - sql: "{CUBE}.dictItemId = {dict_item.id}" + sql: "{CUBE}.`dictItemId` = {dict_item.id}" + relationship: many_to_one + + - name: product_v2_attributes_dict_item + sql: "{CUBE}.`dictItemId` = {product_v2_attributes_dict_item}.`dictItemId`" relationship: many_to_one - name: product - sql: "{CUBE}.productId = {product.id}" + sql: "{CUBE}.`productId` = {product.id}" relationship: many_to_one - dimensions: - - name: dictItemId - sql: dictItemId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/product_v2_attributes_dict_item.yml b/container/cube/conf/model/cubes/product_v2_attributes_dict_item.yml index f6f681b..71b6329 100644 --- a/container/cube/conf/model/cubes/product_v2_attributes_dict_item.yml +++ b/container/cube/conf/model/cubes/product_v2_attributes_dict_item.yml @@ -5,18 +5,18 @@ cubes: joins: - name: dict_item - sql: "{CUBE}.dictItemId = {dict_item.id}" + sql: "{CUBE}.`dictItemId` = {dict_item.id}" + relationship: many_to_one + + - name: product_attributes_dict_item + sql: "{CUBE}.`dictItemId` = 
{product_attributes_dict_item}.`dictItemId`" relationship: many_to_one - name: product_v2 - sql: "{CUBE}.productV2Id = {product_v2.id}" + sql: "{CUBE}.`productV2Id` = {product_v2.id}" relationship: many_to_one - dimensions: - - name: dictItemId - sql: dictItemId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/site_areas_area.yml b/container/cube/conf/model/cubes/site_areas_area.yml index ea03e78..04cd3ce 100644 --- a/container/cube/conf/model/cubes/site_areas_area.yml +++ b/container/cube/conf/model/cubes/site_areas_area.yml @@ -5,18 +5,14 @@ cubes: joins: - name: area - sql: "{CUBE}.areaId = {area.id}" + sql: "{CUBE}.`areaId` = {area.id}" relationship: many_to_one - name: site - sql: "{CUBE}.siteId = {site.id}" + sql: "{CUBE}.`siteId` = {site.id}" relationship: many_to_one - dimensions: - - name: areaId - sql: areaId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/site_stock_points_stock_point.yml b/container/cube/conf/model/cubes/site_stock_points_stock_point.yml index 16d296b..851fab3 100644 --- a/container/cube/conf/model/cubes/site_stock_points_stock_point.yml +++ b/container/cube/conf/model/cubes/site_stock_points_stock_point.yml @@ -5,18 +5,18 @@ cubes: joins: - name: site - sql: "{CUBE}.siteId = {site.id}" + sql: "{CUBE}.`siteId` = {site.id}" + relationship: many_to_one + + - name: site_v2_stock_points_stock_point + sql: "{CUBE}.`stockPointId` = {site_v2_stock_points_stock_point}.`stockPointId`" relationship: many_to_one - name: stock_point - sql: "{CUBE}.stockPointId = {stock_point.id}" + sql: "{CUBE}.`stockPointId` = {stock_point.id}" relationship: many_to_one - dimensions: - - name: stockPointId - sql: stockPointId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/site_v2_areas_area.yml b/container/cube/conf/model/cubes/site_v2_areas_area.yml index 9117b74..7885d3a 100644 --- a/container/cube/conf/model/cubes/site_v2_areas_area.yml +++ b/container/cube/conf/model/cubes/site_v2_areas_area.yml @@ -5,18 +5,14 @@ cubes: joins: - name: area - sql: "{CUBE}.areaId = {area.id}" + sql: "{CUBE}.`areaId` = {area.id}" relationship: many_to_one - name: site_v2 - sql: "{CUBE}.siteV2Id = {site_v2.id}" + sql: "{CUBE}.`siteV2Id` = {site_v2.id}" relationship: many_to_one - dimensions: - - name: areaId - sql: areaId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/site_v2_stock_points_stock_point.yml b/container/cube/conf/model/cubes/site_v2_stock_points_stock_point.yml index ed62bbe..7fe4d50 100644 --- a/container/cube/conf/model/cubes/site_v2_stock_points_stock_point.yml +++ b/container/cube/conf/model/cubes/site_v2_stock_points_stock_point.yml @@ -5,18 +5,18 @@ cubes: joins: - name: site_v2 - sql: "{CUBE}.siteV2Id = {site_v2.id}" + sql: "{CUBE}.`siteV2Id` = {site_v2.id}" + relationship: many_to_one + + - name: site_stock_points_stock_point + sql: "{CUBE}.`stockPointId` = {site_stock_points_stock_point}.`stockPointId`" relationship: many_to_one - name: stock_point - sql: "{CUBE}.stockPointId = {stock_point.id}" + sql: "{CUBE}.`stockPointId` = {stock_point.id}" relationship: many_to_one - dimensions: - - name: stockPointId - sql: stockPointId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/conf/model/cubes/stock_point_areas_area.yml 
b/container/cube/conf/model/cubes/stock_point_areas_area.yml index fd96be7..31a8047 100644 --- a/container/cube/conf/model/cubes/stock_point_areas_area.yml +++ b/container/cube/conf/model/cubes/stock_point_areas_area.yml @@ -5,26 +5,22 @@ cubes: joins: - name: area - sql: "{CUBE}.areaId = {area.id}" + sql: "{CUBE}.`areaId` = {area.id}" relationship: many_to_one - name: site_stock_points_stock_point - sql: "{CUBE}.stockPointId = {site_stock_points_stock_point}.stockPointId" + sql: "{CUBE}.`stockPointId` = {site_stock_points_stock_point}.`stockPointId`" relationship: many_to_one - name: site_v2_stock_points_stock_point - sql: "{CUBE}.stockPointId = {site_v2_stock_points_stock_point}.stockPointId" + sql: "{CUBE}.`stockPointId` = {site_v2_stock_points_stock_point}.`stockPointId`" relationship: many_to_one - name: stock_point - sql: "{CUBE}.stockPointId = {stock_point.id}" + sql: "{CUBE}.`stockPointId` = {stock_point.id}" relationship: many_to_one - dimensions: - - name: areaId - sql: areaId - type: number - primary_key: true + dimensions: [] measures: - name: count diff --git a/container/cube/query/order_item_product_by_sku_quantity_sum.json b/container/cube/query/order_item_product_by_sku_quantity_sum.json index e69de29..d7ef017 100644 --- a/container/cube/query/order_item_product_by_sku_quantity_sum.json +++ b/container/cube/query/order_item_product_by_sku_quantity_sum.json @@ -0,0 +1,7 @@ +{ + "dimensions": ["product.sku"], + "measures": ["order_item.quantity"], + "order": { + "order_item.quantity": "desc" + } +} \ No newline at end of file diff --git a/container/cube/query/order_item_product_component_by_sku_quantity_sum.json b/container/cube/query/order_item_product_component_by_sku_quantity_sum.json index e69de29..c6c2205 100644 --- a/container/cube/query/order_item_product_component_by_sku_quantity_sum.json +++ b/container/cube/query/order_item_product_component_by_sku_quantity_sum.json @@ -0,0 +1,8 @@ +{ + "dimensions": ["product.sku"], + "measures": ["order_item.quantity"], + "order": { + "order_item.quantity": "desc" + }, + "filters": [] +} \ No newline at end of file diff --git a/container/datax/job/README.md b/container/datax/job/README.md deleted file mode 100644 index 7112e0d..0000000 --- a/container/datax/job/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# DataX 作业配置例子 - -本目录包含了使用 DataX 从 RESTful API 读取数据并写入 MySQL 数据库的作业配置示例。 - -## 例子说明 - -### 1. 产品数据同步 (`restful_to_mysql_products.json`) - -从 Woocommerce API 获取产品数据,写入本地 MySQL 数据库的 `products` 表。 - -**主要配置项:** -- API URL: `https://your-woocommerce-site.com/wp-json/wc/v3/products` -- 认证方式: Basic Auth -- 分页方式: 基于页码的分页 -- 写入表: `products` -- 写入模式: `replace` (替换现有数据) - -### 2. 订单数据同步 (`restful_to_mysql_orders.json`) - -从 Woocommerce API 获取订单数据,写入本地 MySQL 数据库的 `orders` 表。 - -**主要配置项:** -- API URL: `https://your-woocommerce-site.com/wp-json/wc/v3/orders` -- 认证方式: Basic Auth -- 分页方式: 基于页码的分页 -- 写入表: `orders` -- 写入模式: `replace` (替换现有数据) - -## 使用方法 - -### 1. 配置修改 - -在运行作业前,需要根据实际情况修改以下配置: - -1. **API URL**: 将 `https://your-woocommerce-site.com` 替换为实际的 Woocommerce 站点 URL -2. **认证信息**: 更新 `Authorization` 头为实际的 Basic Auth 凭证 -3. **数据库连接**: 根据实际情况调整数据库名称、表名等 - -### 2. 运行作业 - -在 DataX 容器中运行作业: - -```bash -docker-compose exec datax python /datax/bin/datax.py /datax/job/restful_to_mysql_products.json -``` - -或 - -```bash -docker-compose exec datax python /datax/bin/datax.py /datax/job/restful_to_mysql_orders.json -``` - -### 3. 查看日志 - -作业日志会输出到容器的 `/datax/log` 目录,对应主机的 `./datax/log` 目录。 - -## 注意事项 - -1. 
**数据库表结构**: 请确保目标数据库中已创建相应的表结构,DataX 不会自动创建表 -2. **API 权限**: 确保 Woocommerce API 已启用,并具有相应的读取权限 -3. **分页配置**: 根据实际数据量调整 `pageSize` 和 `totalCount` 参数 -4. **写入模式**: 支持 `insert` (插入)、`update` (更新)、`replace` (替换) 等模式 -5. **数据类型**: 确保配置文件中的数据类型与数据库表结构匹配 - -## 表结构示例 - -### products 表 - -```sql -CREATE TABLE `products` ( - `id` bigint(20) NOT NULL PRIMARY KEY, - `name` varchar(255) NOT NULL, - `sku` varchar(255) DEFAULT NULL, - `price` decimal(10,2) DEFAULT NULL, - `stock_quantity` int(11) DEFAULT NULL, - `created_at` datetime DEFAULT NULL -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -``` - -### orders 表 - -```sql -CREATE TABLE `orders` ( - `id` bigint(20) NOT NULL PRIMARY KEY, - `status` varchar(50) DEFAULT NULL, - `total` decimal(10,2) DEFAULT NULL, - `currency` varchar(10) DEFAULT NULL, - `created_at` datetime DEFAULT NULL, - `customer_id` bigint(20) DEFAULT NULL, - `billing_first_name` varchar(255) DEFAULT NULL, - `billing_last_name` varchar(255) DEFAULT NULL, - `billing_email` varchar(255) DEFAULT NULL -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -``` - -## 扩展建议 - -1. **添加更多 API 端点**: 可以根据需要添加更多的 API 端点配置,如客户、分类等 -2. **调整同步频率**: 结合 cron 任务实现定期同步 -3. **增加数据转换**: 使用 DataX 的转换器功能对数据进行预处理 -4. **添加错误处理**: 配置重试机制和错误告警 -5. **监控作业状态**: 结合 Prometheus 和 Grafana 监控作业运行状态 diff --git a/container/datax/job/restful_to_mysql_orders.json b/container/datax/job/restful_to_mysql_orders.json deleted file mode 100644 index 45a0433..0000000 --- a/container/datax/job/restful_to_mysql_orders.json +++ /dev/null @@ -1,106 +0,0 @@ -{ - "job": { - "content": [ - { - "reader": { - "name": "restfulreader", - "parameter": { - "url": "https://your-woocommerce-site.com/wp-json/wc/v3/orders", - "method": "GET", - "headers": { - "Accept": "application/json" - }, - "requestParams": { - "consumer_key": "your_consumer_key", - "consumer_secret": "your_consumer_secret", - "per_page": 100 - }, - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 0, - "dynamicTotalCount": true, - "totalCountPath": "$['X-WP-Total']", - "pageUrlTemplate": "${url}?page=${pageNum}&per_page=${pageSize}&consumer_key=${requestParams.consumer_key}&consumer_secret=${requestParams.consumer_secret}" - }, - "dataPath": "$.*", - "column": [ - { - "name": "id", - "type": "long" - }, - { - "name": "status", - "type": "string" - }, - { - "name": "total", - "type": "string" - }, - { - "name": "currency", - "type": "string" - }, - { - "name": "created_at", - "type": "string" - }, - { - "name": "customer_id", - "type": "long" - }, - { - "name": "billing.first_name", - "type": "string", - "alias": "billing_first_name" - }, - { - "name": "billing.last_name", - "type": "string", - "alias": "billing_last_name" - }, - { - "name": "billing.email", - "type": "string", - "alias": "billing_email" - } - ] - } - }, - "writer": { - "name": "mysqlwriter", - "parameter": { - "writeMode": "replace", - "username": "root", - "password": "123345678", - "column": [ - "id", - "status", - "total", - "currency", - "created_at", - "customer_id", - "billing_first_name", - "billing_last_name", - "billing_email" - ], - "connection": [ - { - "jdbcUrl": "jdbc:mysql://host.docker.internal:23306/inventory_v2?useUnicode=true&characterEncoding=utf-8", - "table": [ - "orders" - ] - } - ] - } - } - } - ], - "setting": { - "speed": { - "channel": 1 - } - } - } -} \ No newline at end of file diff --git a/container/datax/job/restful_to_mysql_products.json b/container/datax/job/restful_to_mysql_products.json deleted file mode 100644 index 3a4772f..0000000 --- 
a/container/datax/job/restful_to_mysql_products.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "job": { - "content": [ - { - "reader": { - "name": "restfulreader", - "parameter": { - "url": "https://your-woocommerce-site.com/wp-json/wc/v3/products", - "method": "GET", - "headers": { - "Accept": "application/json" - }, - "requestParams": { - "consumer_key": "your_consumer_key", - "consumer_secret": "your_consumer_secret", - "per_page": 100 - }, - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 0, - "dynamicTotalCount": true, - "totalCountPath": "$['X-WP-Total']", - "pageUrlTemplate": "${url}?page=${pageNum}&per_page=${pageSize}&consumer_key=${requestParams.consumer_key}&consumer_secret=${requestParams.consumer_secret}" - }, - "dataPath": "$.*", - "column": [ - { - "name": "id", - "type": "long" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "sku", - "type": "string" - }, - { - "name": "price", - "type": "string" - }, - { - "name": "stock_quantity", - "type": "long" - }, - { - "name": "created_at", - "type": "string" - } - ] - } - }, - "writer": { - "name": "mysqlwriter", - "parameter": { - "writeMode": "replace", - "username": "root", - "password": "123345678", - "column": [ - "id", - "name", - "sku", - "price", - "stock_quantity", - "created_at" - ], - "connection": [ - { - "jdbcUrl": "jdbc:mysql://host.docker.internal:23306/inventory_v2?useUnicode=true&characterEncoding=utf-8", - "table": [ - "products" - ] - } - ] - } - } - } - ], - "setting": { - "speed": { - "channel": 1 - } - } - } -} \ No newline at end of file diff --git a/container/datax/scripts/check_sync_status.sh b/container/datax/scripts/check_sync_status.sh deleted file mode 100644 index 00e817b..0000000 --- a/container/datax/scripts/check_sync_status.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -echo "======================================" -echo "DataX 同步任务状态监控" -echo "======================================" -echo "检查时间: $(date '+%Y-%m-%d %H:%M:%S')" -echo "======================================" - -# 检查产品同步日志 -PRODUCTS_LOG_DIR="/Users/zksu/Developer/work/workcode/API-vendor/container/datax/log" -PRODUCTS_LOG=$(ls -t ${PRODUCTS_LOG_DIR}/products_sync_*.log 2>/dev/null | head -1) -if [ -f "$PRODUCTS_LOG" ]; then - echo -e "\n产品同步:" - echo "最后执行时间:$(date -r $PRODUCTS_LOG '+%Y-%m-%d %H:%M:%S')" - echo "日志文件:$PRODUCTS_LOG" - echo "执行结果:$(tail -1 $PRODUCTS_LOG)" - echo "日志大小:$(ls -lh $PRODUCTS_LOG | awk '{print $5}')" -else - echo -e "\n产品同步:" - echo "未找到产品同步日志文件" -fi - -# 检查订单同步日志 -ORDERS_LOG=$(ls -t ${PRODUCTS_LOG_DIR}/orders_sync_*.log 2>/dev/null | head -1) -if [ -f "$ORDERS_LOG" ]; then - echo -e "\n订单同步:" - echo "最后执行时间:$(date -r $ORDERS_LOG '+%Y-%m-%d %H:%M:%S')" - echo "日志文件:$ORDERS_LOG" - echo "执行结果:$(tail -1 $ORDERS_LOG)" - echo "日志大小:$(ls -lh $ORDERS_LOG | awk '{print $5}')" -else - echo -e "\n订单同步:" - echo "未找到订单同步日志文件" -fi - -echo -e "\n======================================" -echo "监控结束" -echo "======================================" \ No newline at end of file diff --git a/container/datax/scripts/sync_orders.sh b/container/datax/scripts/sync_orders.sh deleted file mode 100644 index 5bc95fc..0000000 --- a/container/datax/scripts/sync_orders.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# 设置工作目录 -cd /Users/zksu/Developer/work/workcode/API-vendor/container - -# 记录开始时间 -echo "[$(date '+%Y-%m-%d %H:%M:%S')] 开始执行订单同步..." 
> /tmp/orders_sync_start.log - -# 执行 DataX 作业 -docker-compose exec -T datax python /datax/bin/datax.py /datax/job/restful_to_mysql_orders.json > /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/orders_sync_$(date +%Y%m%d_%H%M%S).log 2>&1 - -# 检查执行结果 -if [ $? -eq 0 ]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 订单同步执行成功" >> /tmp/orders_sync_start.log -else - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 订单同步执行失败" >> /tmp/orders_sync_start.log - # 发送告警通知(示例:企业微信机器人,如需使用请取消注释并修改webhook地址) - # curl -s -H "Content-Type: application/json" -d '{"msgtype": "text", "text": {"content": "订单同步执行失败,请查看日志"}}' https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=your_webhook_key -fi \ No newline at end of file diff --git a/container/datax/scripts/sync_products.sh b/container/datax/scripts/sync_products.sh deleted file mode 100644 index e10a767..0000000 --- a/container/datax/scripts/sync_products.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# 设置工作目录 -cd /Users/zksu/Developer/work/workcode/API-vendor/container - -# 记录开始时间 -echo "[$(date '+%Y-%m-%d %H:%M:%S')] 开始执行产品同步..." > /tmp/products_sync_start.log - -# 执行 DataX 作业 -docker-compose exec -T datax python /datax/bin/datax.py /datax/job/restful_to_mysql_products.json > /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/products_sync_$(date +%Y%m%d_%H%M%S).log 2>&1 - -# 检查执行结果 -if [ $? -eq 0 ]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 产品同步执行成功" >> /tmp/products_sync_start.log -else - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 产品同步执行失败" >> /tmp/products_sync_start.log - # 发送告警通知(示例:企业微信机器人,如需使用请取消注释并修改webhook地址) - # curl -s -H "Content-Type: application/json" -d '{"msgtype": "text", "text": {"content": "产品同步执行失败,请查看日志"}}' https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=your_webhook_key -fi \ No newline at end of file diff --git a/container/docker-compose.yml b/container/docker-compose.yml index 4c68848..c1bc168 100644 --- a/container/docker-compose.yml +++ b/container/docker-compose.yml @@ -77,17 +77,6 @@ services: networks: - app-network - # DataX 服务 - datax: - image: beginor/datax - container_name: datax-container - restart: on-failure - volumes: - - ./datax/job:/datax/job - - ./datax/log:/datax/log - networks: - - app-network - # volumes: # mysql-data: # driver: local diff --git a/docs/datax/scheduled_execution.md b/docs/datax/scheduled_execution.md deleted file mode 100644 index 668ea6d..0000000 --- a/docs/datax/scheduled_execution.md +++ /dev/null @@ -1,319 +0,0 @@ -# DataX 同步脚本定时执行指南 - -本文档介绍了如何定时执行 DataX 同步脚本的两种主要方式:宿主机 crontab 和容器内 crontab。 - -## 1. 
宿主机 Crontab 方式 - -### 1.1 原理 - -在宿主机上使用 `crontab` 工具,通过 `docker-compose exec` 命令定期在 DataX 容器内执行同步脚本。 - -### 1.2 配置步骤 - -#### 1.2.1 创建执行脚本 - -首先,在宿主机上创建一个执行脚本,用于运行 DataX 作业: - -```bash -# 创建脚本目录 -mkdir -p /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts - -# 创建产品同步脚本 -touch /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh -chmod +x /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh - -# 创建订单同步脚本 -touch /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh -chmod +x /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh -``` - -#### 1.2.2 编写脚本内容 - -**产品同步脚本** (`sync_products.sh`): - -```bash -#!/bin/bash - -# 设置工作目录 -cd /Users/zksu/Developer/work/workcode/API-vendor/container - -# 执行 DataX 作业 -docker-compose exec -T datax python /datax/bin/datax.py /datax/job/restful_to_mysql_products.json > /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/products_sync_$(date +%Y%m%d_%H%M%S).log 2>&1 -``` - -**订单同步脚本** (`sync_orders.sh`): - -```bash -#!/bin/bash - -# 设置工作目录 -cd /Users/zksu/Developer/work/workcode/API-vendor/container - -# 执行 DataX 作业 -docker-compose exec -T datax python /datax/bin/datax.py /datax/job/restful_to_mysql_orders.json > /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/orders_sync_$(date +%Y%m%d_%H%M%S).log 2>&1 -``` - -#### 1.2.3 配置 Crontab - -1. 编辑宿主机的 crontab 配置: - -```bash -crontab -e -``` - -2. 添加定时任务: - -```bash -# 每小时执行一次产品同步 -0 * * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh - -# 每2小时执行一次订单同步 -0 */2 * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh - -# 每天凌晨3点执行所有同步 -0 3 * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh && /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh -``` - -3. 保存并退出编辑器。 - -### 1.3 优点 - -- **配置简单**:无需修改 Docker 容器配置 -- **易于管理**:所有定时任务集中在宿主机管理 -- **日志管理方便**:可以将日志输出到宿主机指定位置 -- **不依赖容器状态**:即使容器重启,定时任务依然有效 - -### 1.4 缺点 - -- **依赖宿主机**:宿主机故障会导致定时任务失效 -- **需要宿主机权限**:需要在宿主机上配置 crontab - -## 2. 容器内 Crontab 方式 - -### 2.1 原理 - -在 DataX 容器内部安装并配置 `crontab` 服务,直接在容器内定时执行同步脚本。 - -### 2.2 配置步骤 - -#### 2.2.1 自定义 Docker 镜像 - -1. 创建 `Dockerfile`: - -```bash -mkdir -p /Users/zksu/Developer/work/workcode/API-vendor/container/datax/docker -``` - -2. 编写 `Dockerfile`: - -```dockerfile -FROM beginor/datax - -# 安装 crontab -RUN apt-get update && apt-get install -y cron && rm -rf /var/lib/apt/lists/* - -# 创建 crontab 配置文件 -RUN touch /etc/crontab /etc/cron.d/datax-cron - -# 给执行权限 -RUN chmod 0644 /etc/cron.d/datax-cron - -# 创建日志目录 -RUN mkdir -p /var/log/cron - -# 复制作业脚本 -COPY job/ /datax/job/ - -# 设置环境变量 -ENV TZ=Asia/Shanghai - -# 启动 crontab 服务 -CMD service cron start && tail -f /dev/null -``` - -3. 构建自定义镜像: - -```bash -cd /Users/zksu/Developer/work/workcode/API-vendor/container/datax/docker -docker build -t custom/datax . -``` - -4. 修改 `docker-compose.yml` 中的镜像: - -```yaml -datax: - image: custom/datax - # ... 其他配置不变 -``` - -#### 2.2.2 配置容器内 Crontab - -1. 进入容器: - -```bash -docker-compose exec datax bash -``` - -2. 编辑 crontab 配置: - -```bash -crontab -e -``` - -3. 
添加定时任务: - -```bash -# 每小时执行一次产品同步 -0 * * * * python /datax/bin/datax.py /datax/job/restful_to_mysql_products.json > /datax/log/products_sync_$(date +\%Y\%m\%d_\%H\%M\%S).log 2>&1 - -# 每2小时执行一次订单同步 -0 */2 * * * python /datax/bin/datax.py /datax/job/restful_to_mysql_orders.json > /datax/log/orders_sync_$(date +\%Y\%m\%d_\%H\%M\%S).log 2>&1 -``` - -4. 保存并退出编辑器。 - -5. 重启 crontab 服务: - -```bash -service cron restart -``` - -### 2.3 优点 - -- **容器化管理**:定时任务与容器绑定,便于迁移和部署 -- **不依赖宿主机**:容器可以在任何 Docker 环境中运行,定时任务依然有效 -- **隔离性好**:定时任务只影响当前容器 - -### 2.4 缺点 - -- **配置复杂**:需要自定义 Docker 镜像 -- **日志管理**:日志默认保存在容器内部,需要通过卷挂载到宿主机 -- **容器重启影响**:容器重启后需要确保 crontab 服务自动启动 - -## 3. 定时任务监控 - -### 3.1 日志监控 - -- **宿主机方式**:直接查看宿主机上的日志文件 -- **容器内方式**:通过 `docker-compose logs -f datax` 查看容器日志,或通过卷挂载查看日志文件 - -### 3.2 执行状态监控 - -1. **创建监控脚本**: - -```bash -#!/bin/bash - -# 检查产品同步日志 -PRODUCTS_LOG=$(ls -t /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/products_sync_*.log | head -1) -if [ -f "$PRODUCTS_LOG" ]; then - echo "产品同步最后执行时间:$(date -r $PRODUCTS_LOG)" - echo "执行结果:$(tail -1 $PRODUCTS_LOG)" -else - echo "产品同步日志不存在" -fi - -# 检查订单同步日志 -ORDERS_LOG=$(ls -t /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/orders_sync_*.log | head -1) -if [ -f "$ORDERS_LOG" ]; then - echo "订单同步最后执行时间:$(date -r $ORDERS_LOG)" - echo "执行结果:$(tail -1 $ORDERS_LOG)" -else - echo "订单同步日志不存在" -fi -``` - -2. 给脚本执行权限: - -```bash -chmod +x /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/check_sync_status.sh -``` - -3. 执行监控脚本: - -```bash -/Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/check_sync_status.sh -``` - -### 3.3 告警机制 - -可以结合以下工具实现告警机制: - -- **Prometheus + Grafana**:监控同步任务执行状态和时间 -- **ELK Stack**:收集和分析同步日志,设置告警规则 -- **企业微信/钉钉机器人**:通过脚本检查执行结果,发送告警通知 - -## 4. 最佳实践 - -### 4.1 选择合适的方式 - -- **小型项目/测试环境**:推荐使用宿主机 crontab 方式,配置简单 -- **大型项目/生产环境**:推荐使用容器内 crontab 方式,便于管理和迁移 -- **需要高可用**:考虑使用专门的任务调度工具,如 Apache Airflow、XXL-JOB 等 - -### 4.2 日志管理 - -- **设置合理的日志保留时间**:避免日志文件过大 -- **使用日志轮转**:定期压缩和清理旧日志 -- **集中日志管理**:将日志发送到 ELK 或其他日志管理系统 - -### 4.3 错误处理 - -- **添加重试机制**:在脚本中添加失败重试逻辑 -- **设置超时时间**:避免单个任务执行时间过长 -- **及时告警**:同步失败时立即发送告警通知 - -### 4.4 性能优化 - -- **合理设置同步频率**:根据数据更新频率设置合适的同步间隔 -- **使用增量同步**:只同步新增或修改的数据,减少同步时间 -- **调整并发数**:根据系统资源和 API 限制调整 DataX 的并发通道数 - -## 5. 示例:完整的宿主机 Crontab 配置 - -### 5.1 产品同步脚本 (`sync_products.sh`) - -```bash -#!/bin/bash - -# 设置工作目录 -cd /Users/zksu/Developer/work/workcode/API-vendor/container - -# 记录开始时间 -echo "[$(date '+%Y-%m-%d %H:%M:%S')] 开始执行产品同步..." > /tmp/products_sync_start.log - -# 执行 DataX 作业 -docker-compose exec -T datax python /datax/bin/datax.py /datax/job/restful_to_mysql_products.json > /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log/products_sync_$(date +%Y%m%d_%H%M%S).log 2>&1 - -# 检查执行结果 -if [ $? 
-eq 0 ]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 产品同步执行成功" >> /tmp/products_sync_start.log -else - echo "[$(date '+%Y-%m-%d %H:%M:%S')] 产品同步执行失败" >> /tmp/products_sync_start.log - # 发送告警通知(示例:企业微信机器人) - curl -s -H "Content-Type: application/json" -d '{"msgtype": "text", "text": {"content": "产品同步执行失败,请查看日志"}}' https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=your_webhook_key -fi -``` - -### 5.2 Crontab 配置 - -```bash -# 每小时执行一次产品同步 -0 * * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh - -# 每2小时执行一次订单同步 -0 */2 * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh - -# 每天凌晨3点执行所有同步 -0 3 * * * /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_products.sh && /Users/zksu/Developer/work/workcode/API-vendor/container/datax/scripts/sync_orders.sh - -# 每天凌晨4点清理7天前的日志 -0 4 * * * find /Users/zksu/Developer/work/workcode/API-vendor/container/datax/log -name "*.log" -mtime +7 -delete -``` - -## 6. 扩展阅读 - -- [Crontab 语法详解](https://crontab.guru/) -- [Docker 容器中使用 Crontab](https://docs.docker.com/config/containers/multi-service_container/) -- [Apache Airflow 官方文档](https://airflow.apache.org/docs/) -- [XXL-JOB 官方文档](https://www.xuxueli.com/xxl-job/) diff --git a/docs/datax/woocommerce_api_config.md b/docs/datax/woocommerce_api_config.md deleted file mode 100644 index dde0a4a..0000000 --- a/docs/datax/woocommerce_api_config.md +++ /dev/null @@ -1,312 +0,0 @@ -# Woocommerce API 配置指南 - -## 1. 认证方式配置 - -Woocommerce API 支持两种认证方式:Basic Auth 和 OAuth 1.0a。对于使用 `customer_key` 和 `customer_secret` 的情况,我们需要使用 OAuth 1.0a 认证。 - -### 1.1 OAuth 1.0a 认证配置 - -在 DataX 的 RESTful Reader 中,我们可以通过自定义请求参数和签名来实现 OAuth 1.0a 认证。以下是配置示例: - -```json -{ - "reader": { - "name": "restfulreader", - "parameter": { - "url": "https://your-woocommerce-site.com/wp-json/wc/v3/products", - "method": "GET", - "headers": { - "Accept": "application/json" - }, - "requestParams": { - "consumer_key": "your_consumer_key", - "consumer_secret": "your_consumer_secret" - }, - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 1000 - }, - "dataPath": "$.*", - "column": [ - // 列配置 - ] - } - } -} -``` - -### 1.2 认证参数说明 - -- `consumer_key`: Woocommerce 后台生成的消费者密钥 -- `consumer_secret`: Woocommerce 后台生成的消费者密钥密码 - -### 1.3 获取消费者密钥 - -1. 登录 Woocommerce 后台 -2. 进入 `WooCommerce > 设置 > 高级 > REST API` -3. 点击 `添加密钥` 按钮 -4. 填写描述,选择用户和权限 -5. 点击 `生成 API 密钥` -6. 保存生成的 `consumer_key` 和 `consumer_secret` - -## 2. 分页获取全部数据 - -Woocommerce API 默认一次最多返回 100 条数据,要获取全部数据,我们需要使用分页机制。DataX 的 RESTful Reader 支持多种分页方式,这里我们使用基于页码的分页。 - -### 2.1 分页配置示例 - -```json -{ - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 1000, - "pageToken": { - "offset": 0, - "limit": 100 - }, - "pageUrlTemplate": "${url}?page=${pageNum}&per_page=${pageSize}" - } -} -``` - -### 2.2 分页参数说明 - -| 参数名 | 说明 | 默认值 | -|--------|------|--------| -| `type` | 分页类型,支持 `page`(基于页码)、`offset`(基于偏移量)、`cursor`(基于游标) | 无 | -| `pageSize` | 每页数据条数 | 100 | -| `pageNum` | 起始页码 | 1 | -| `totalCount` | 预估总数据量,用于计算总页数 | 1000 | -| `pageUrlTemplate` | 分页 URL 模板,用于构建请求 URL | 无 | - -### 2.3 动态获取总数据量 - -为了确保获取全部数据,我们可以先请求一次 API 获取总数据量,然后再进行分页请求。以下是实现思路: - -1. 首先发送一次请求,获取总数据量 -2. 根据总数据量和每页条数计算总页数 -3. 
遍历所有页码,获取全部数据 - -### 2.4 完整的分页配置示例 - -```json -{ - "job": { - "content": [ - { - "reader": { - "name": "restfulreader", - "parameter": { - "url": "https://your-woocommerce-site.com/wp-json/wc/v3/products", - "method": "GET", - "headers": { - "Accept": "application/json" - }, - "requestParams": { - "consumer_key": "your_consumer_key", - "consumer_secret": "your_consumer_secret", - "per_page": 100 - }, - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 0, - "dynamicTotalCount": true, - "totalCountPath": "$['X-WP-Total']", - "pageUrlTemplate": "${url}?page=${pageNum}&per_page=${pageSize}&consumer_key=${requestParams.consumer_key}&consumer_secret=${requestParams.consumer_secret}" - }, - "dataPath": "$.*", - "column": [ - { - "name": "id", - "type": "long" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "sku", - "type": "string" - }, - { - "name": "price", - "type": "string" - }, - { - "name": "stock_quantity", - "type": "long" - }, - { - "name": "created_at", - "type": "string" - } - ] - } - }, - "writer": { - // 写入配置 - } - } - ], - "setting": { - "speed": { - "channel": 1 - } - } - } -} -``` - -## 3. 完整的作业配置示例 - -以下是一个完整的从 Woocommerce API 获取产品数据并写入 MySQL 的作业配置,包含了 OAuth 1.0a 认证和分页配置: - -```json -{ - "job": { - "content": [ - { - "reader": { - "name": "restfulreader", - "parameter": { - "url": "https://your-woocommerce-site.com/wp-json/wc/v3/products", - "method": "GET", - "headers": { - "Accept": "application/json" - }, - "requestParams": { - "consumer_key": "ck_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "consumer_secret": "cs_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", - "per_page": 100 - }, - "pagination": { - "type": "page", - "pageSize": 100, - "pageNum": 1, - "totalCount": 0, - "dynamicTotalCount": true, - "totalCountPath": "$['X-WP-Total']", - "pageUrlTemplate": "${url}?page=${pageNum}&per_page=${pageSize}&consumer_key=${requestParams.consumer_key}&consumer_secret=${requestParams.consumer_secret}" - }, - "dataPath": "$.*", - "column": [ - { - "name": "id", - "type": "long" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "sku", - "type": "string" - }, - { - "name": "price", - "type": "string" - }, - { - "name": "stock_quantity", - "type": "long" - }, - { - "name": "created_at", - "type": "string" - } - ] - } - }, - "writer": { - "name": "mysqlwriter", - "parameter": { - "writeMode": "replace", - "username": "root", - "password": "123345678", - "column": [ - "id", - "name", - "sku", - "price", - "stock_quantity", - "created_at" - ], - "connection": [ - { - "jdbcUrl": "jdbc:mysql://host.docker.internal:23306/inventory_v2?useUnicode=true&characterEncoding=utf-8", - "table": [ - "products" - ] - } - ] - } - } - } - ], - "setting": { - "speed": { - "channel": 1 - } - } - } -} -``` - -## 4. 常见问题及解决方案 - -### 4.1 认证失败 - -**问题**:API 请求返回 401 Unauthorized - -**解决方案**: -1. 检查 `consumer_key` 和 `consumer_secret` 是否正确 -2. 确保 API 用户具有足够的权限 -3. 检查 API 版本是否正确 - -### 4.2 分页失效 - -**问题**:只获取到部分数据,分页没有生效 - -**解决方案**: -1. 检查分页类型配置是否正确 -2. 确保 `pageUrlTemplate` 配置正确 -3. 检查 `totalCount` 是否设置合理 -4. 启用 `dynamicTotalCount` 动态获取总数据量 - -### 4.3 请求频率限制 - -**问题**:API 请求返回 429 Too Many Requests - -**解决方案**: -1. 减少每页请求的数据量 -2. 增加请求间隔时间 -3. 联系 Woocommerce 支持调整 API 速率限制 - -### 4.4 数据类型不匹配 - -**问题**:数据写入数据库时类型不匹配 - -**解决方案**: -1. 检查 DataX 配置中的列类型与数据库表结构是否匹配 -2. 使用 DataX 的转换器功能进行数据类型转换 -3. 在读取阶段指定正确的数据类型 - -## 5. 性能优化建议 - -1. **调整分页大小**:根据 API 限制和网络状况调整 `pageSize`,建议设置为 API 允许的最大值(100) -2. **增加并发通道**:在 DataX 作业配置中增加 `channel` 数量,提高数据同步速度 -3. 
**使用增量同步**:通过添加时间过滤条件,只同步新增或修改的数据 -4. **优化数据库写入**:使用 `insert` 或 `update` 模式替代 `replace` 模式,减少数据库操作开销 -5. **定期清理日志**:定期清理 DataX 日志文件,避免磁盘空间不足 - -## 6. 扩展阅读 - -- [Woocommerce REST API 文档](https://woocommerce.github.io/woocommerce-rest-api-docs/) -- [DataX RESTful Reader 文档](https://github.com/alibaba/DataX/blob/master/restfulreader/doc/restfulreader.md) -- [DataX MySQL Writer 文档](https://github.com/alibaba/DataX/blob/master/mysqlwriter/doc/mysqlwriter.md)