步骤 1:定义数据库表
sql
CREATE TABLE spider_data (
id INT AUTO_INCREMENT PRIMARY KEY,
url VARCHAR(255) NOT NULL,
content TEXT,
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
status ENUM('success', 'error') DEFAULT 'success',
user_agent VARCHAR(255),
ip_address VARCHAR(15)
);
步骤 2:创建存储过程
sql
DELIMITER $$
CREATE PROCEDURE InsertSpiderData(
IN url VARCHAR(255),
IN content TEXT,
IN user_agent VARCHAR(255),
IN ip_address VARCHAR(15)
)
BEGIN
插入数据
INSERT INTO spider_data (url, content, user_agent, ip_address)
VALUES (url, content, user_agent, ip_address);
3、扩展功能:定时任务与触发器
3.1 使用 `EVENT` 定时执行存储过程
sql
CREATE EVENT daily_spider_job
ON SCHEDULE EVERY 1 DAY
DO CALL InsertSpiderData('https://65.hk', 'HTML内容', 'Mozilla/5.0', '192.168.1.1');
3.2 使用触发器自动插入数据
sql
DELIMITER $$
CREATE TRIGGER after_crawler_run
AFTER INSERT ON crawler_run
FOR EACH ROW
BEGIN
INSERT INTO spider_data (url, content, user_agent, ip_address)
VALUES (NEW.url, NEW.content, NEW.user_agent, NEW.ip_address);
END $$
CREATE PROCEDURE CleanData(IN data TEXT)
BEGIN
DECLARE cleaned TEXT;
SET cleaned = REPLACE(REPLACE(data, '<html', ''), '</html', '');
UPDATE spider_data SET content = cleaned WHERE content LIKE '%<html%';
END $$