老司机开车记
Node.js + jsdom 小爬虫,并不是框架 >.<
用于解决某些个人问题:
利用 DFS(深度优先搜索)遍历站点中以 /pic 开头的链接,并把文件下载到本地。
"use strict";
let https=require('https');
let fs=require('fs');
let path=require('path');
let jsdom=require('jsdom');
/** Site-relative URLs the crawler has already seen (Set gives O(1) lookups). */
let visted = new Set();

/**
 * Decides whether a path names a directory and, if so, tries to create it.
 *
 * Any path containing a '.' is treated as a file. Everything else is treated
 * as a directory and created on disk with fs.mkdirSync.
 *
 * @param {string} url - Site-relative path, also used as the local path.
 * @returns {boolean} true when the path is treated as a directory.
 */
function isDir(url) {
  if (url.includes('.')) {
    return false;
  }
  try {
    fs.mkdirSync(url);
    console.log('mkdir:' + url);
  } catch (e) {
    // Best effort: a failed mkdir (e.g. the directory already exists) is only logged.
    console.log(`IOerror:${e}`);
  }
  return true;
}

/**
 * Reports whether a URL was seen before and records it if it was not.
 *
 * @param {string} url - URL to check.
 * @returns {boolean} true if the URL had already been recorded, false on first sight.
 */
function isUrlVisited(url) {
  if (visted.has(url)) {
    return true;
  }
  visted.add(url);
  return false;
}

/**
 * Downloads a file URL to disk; directories that were already visited are skipped.
 *
 * @param {string} url - Site-relative path of the resource.
 */
function writeFile(url) {
  if (isDir(url) && isUrlVisited(url)) {
    return;
  }
  try {
    urltoFile(url);
  } catch (error) {
    // Only synchronous failures of starting the request end up here.
    console.log(error);
  }
}

/**
 * Requests root + url over HTTPS and appends the response body to the local
 * file named by url.
 *
 * @param {string} url - Site-relative path; also the local destination path.
 */
function urltoFile(url) {
  https.request(root + url, (res) => {
    if (res.statusCode !== 200) {
      // Do not store error pages or redirect bodies as file content.
      console.log(`HTTP ${res.statusCode}@${url}`);
      res.resume();
      return;
    }
    let writeFailed = false;
    res.on('data', (data) => {
      if (writeFailed) {
        return;
      }
      try {
        fs.appendFileSync(url, data);
      } catch (e) {
        // A throw inside this async callback would otherwise crash the process.
        writeFailed = true;
        console.log(e);
      }
    });
    res.on('error', (e) => {
      console.log(e);
    });
  }).on('error', (e) => {
    console.log(e);
  }).end();
}

/**
 * Marks a URL as visited, creates its directory when it is one, then hands it
 * to writeFile.
 *
 * @param {string} url - Site-relative path.
 */
function dealUrl(url) {
  isUrlVisited(url);
  isDir(url);
  writeFile(url);
}
// Origin of the site being crawled; site-relative hrefs are appended to it.
const root = 'https://www.seryox.com';
/**
 * Loads a page with jsdom, then walks every <a> whose href starts with "/pic".
 *
 * Unvisited hrefs that isDir() treats as directories are crawled recursively
 * (depth-first); all others are downloaded through writeFile().
 *
 * @param {string} url - Absolute URL of the page to load.
 */
function applyUrl(url) {
  jsdom.env({
    url: url,
    scripts: ["http://code.jquery.com/jquery.js"],
    done: function (err, window) {
      if (err || !window) {
        // Without a window there is nothing to scan; report and stop.
        console.log(err + '@' + url);
        return;
      }
      console.log('done');
      try {
        let $ = window.$;
        let arr = $('a');
        console.log(arr.length);
        for (let i = 0; i < arr.length; i++) {
          let href = $(arr[i]).attr('href');
          if (typeof href !== 'string') {
            // An anchor without href must not abort the rest of the page.
            continue;
          }
          if (href.match(/^\/pic/) && !isUrlVisited(href)) {
            console.log(href);
            if (isDir(href)) {
              console.log('dir:' + url + href);
              applyUrl(root + href);
            } else {
              console.log('file:' + url + href);
              writeFile(href);
            }
          }
        }
      } catch (e) {
        console.log(e + '@' + url);
      } finally {
        // All hrefs are plain strings by now, so the window can be released.
        window.close();
      }
    }
  });
}
// Start the crawl at the site's front page.
applyUrl(`${root}/`);
生成的文件夹名称为 UTF-8 URL 编码(百分号编码)。
可利用下面的脚本将其解码并重命名:
"use strict";
let fs=require('fs');
// Directory whose entries are renamed from URL-encoded to decoded names.
let root = '/pic';

/**
 * Name-based heuristic: an entry without a '.' is considered a directory.
 * main() asks the filesystem instead; this helper is kept for existing callers.
 *
 * @param {string} url - Entry name or path.
 * @returns {boolean} true when the name contains no '.'.
 */
function isDir(url) {
  return url.indexOf('.') === -1;
}

/**
 * Renames path/name to its decodeURI()-decoded form when the name is
 * percent-encoded.
 *
 * @param {string} path - Directory containing the entry.
 * @param {string} name - Entry name, possibly percent-encoded.
 */
function rename(path, name) {
  if (name.indexOf('%') === -1) {
    return;
  }
  let decoded;
  try {
    decoded = decodeURI(name);
  } catch (e) {
    // A malformed escape sequence must not abort the whole run.
    console.log(`skip ${path}/${name}: ${e}`);
    return;
  }
  if (decoded === name) {
    return;
  }
  console.log(path + '/' + name + ' to ' + path + '/' + decoded);
  fs.renameSync(path + '/' + name, path + '/' + decoded);
}

/**
 * Recursively decodes the names of everything below a directory.
 *
 * Children are processed before their parent directory is renamed, so the
 * paths used during recursion stay valid.
 *
 * @param {string} path - Directory to process.
 */
function main(path) {
  let fd = fs.readdirSync(path);
  for (const entry of fd) {
    const full = path + '/' + entry;
    // Check the filesystem rather than the name: extension-less files exist.
    if (fs.lstatSync(full).isDirectory()) {
      main(full);
    }
    rename(path, entry);
  }
}
// Decode every URL-encoded file and directory name below root.
main(root);