自娱小程序--超大文件topN

 设计思路:
new 一个线程池。
打开文件,可选 NIO 或 Reader(BIO)两种方式。NIO 方式:每次 map 一大块 MappedByteBuffer,从 Buffer 中读出一定大小的数据,定位到最后一个'\n';最后一个'\n'及之前的数据 put 到一个线程执行类实例的 Buffer 中,余下的 put 到一个临时 Buffer 里,留到下一次循环时处理。在线程的执行体中,先 rewind ByteBuffer,再循环处理 buffer,每读到一条完整的 import 语句就 put 到 map 里,buffer 处理完成后把 map 合并到全局 ConcurrentMap 中。BIO 方式:每读够一定行数就 submit 一个任务到线程池,任务中用正则表达式处理每一行生成 map,处理完成后合并 map。
上代码:
=========================NIO================================= 

package com.arvey.files;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.nio.ByteBuffer;

import java.nio.MappedByteBuffer;

import java.nio.channels.FileChannel;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.concurrent.ConcurrentHashMap;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

/**
 * Counts how often each {@code import} statement occurs in one very large text file,
 * reading the file through memory-mapped regions and counting in a thread pool.
 *
 * <p>The file is mapped {@link #mappedSize} bytes at a time. Each mapped region is cut
 * into slices of at most {@link #handleSize} bytes; every slice is trimmed back to its
 * last {@code '\n'} so that a worker only ever sees complete lines, and the trailing
 * partial line is carried over to the next slice. Workers build a private map and merge
 * it into the shared {@link #result} map when they finish.
 *
 * <p>A line is counted when it contains the bytes {@code "import "}; the key is the text
 * from that keyword to the end of the line, trimmed. Bytes are decoded with the platform
 * default charset.
 *
 * <p>An instance is single-use: {@link #getTop10()} shuts the worker pool down.
 */
public class StateCounterNIO {

    /** Number of bytes requested from each {@code FileChannel.map} call. */
    public static final int mappedSize = 5*4*1024;

    /** Maximum number of fresh bytes taken from the mapped region per worker task. */
    public static final int handleSize = 4*1024;

    /** Size of the fixed worker thread pool. */
    public final static int ExecutorsNum = 200;

    /** Number of entries printed by {@link #main(String[])}. */
    private static final int TOP_LIMIT = 100;

    /** Byte form of the keyword that marks the start of a counted statement. */
    private static final byte[] IMPORT_KEYWORD = "import ".getBytes();

    /** Path of the file to scan; defaults to the author's local test file. */
    String file="/Users/arvey/wwork/Docs.code.clean/docsapp/bigfile.src";

    /** Global statement-to-count map shared by all workers (and all instances). */
    public static ConcurrentHashMap<String,Integer> result = new ConcurrentHashMap<String,Integer>();

    private final ExecutorService pool = Executors.newFixedThreadPool(ExecutorsNum);

    /** Creates a counter for the default {@link #file} path. */
    public StateCounterNIO() {
    }

    /**
     * Creates a counter for the given file.
     *
     * @param file path of the file to scan
     */
    public StateCounterNIO(String file) {
        this.file = file;
    }

    /**
     * Adds the counts of a worker's private map to the global {@link #result} map.
     * Synchronized so that the read-add-write on each key is not interleaved between workers.
     *
     * @param partial statement-to-count map produced by one worker
     */
    public static synchronized void updateTopList(Map<String,Integer> partial){
        for (Map.Entry<String,Integer> entry : partial.entrySet()) {
            Integer current = result.get(entry.getKey());
            int increment = entry.getValue().intValue();
            result.put(entry.getKey(), current == null ? increment : current.intValue() + increment);
        }
    }

    /**
     * Scans {@link #file}, hands complete-line chunks to the worker pool and blocks until
     * every worker has merged its counts into {@link #result}.
     *
     * <p>I/O failures are reported on standard error and end the scan early; chunks that
     * were already submitted are still counted.
     */
    public void getTop10(){
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(file));
            FileChannel fc = fis.getChannel();
            long fileLength = fc.size();
            long foffset = 0L;
            // Bytes after the last '\n' of the previous slice: an incomplete line.
            byte[] pending = new byte[0];

            while (foffset < fileLength) {
                int mapLength = (int) Math.min(fileLength - foffset, (long) mappedSize);
                MappedByteBuffer buffer = fc.map(FileChannel.MapMode.READ_ONLY, foffset, mapLength);

                while (buffer.hasRemaining()) {
                    int fresh = Math.min(handleSize, buffer.remaining());
                    byte[] chunk = new byte[pending.length + fresh];
                    System.arraycopy(pending, 0, chunk, 0, pending.length);
                    buffer.get(chunk, pending.length, fresh);

                    boolean endOfFile = !buffer.hasRemaining() && foffset + mapLength == fileLength;
                    // Everything up to and including the last '\n' is complete; at end of
                    // file the unterminated last line is complete as well.
                    int complete = endOfFile ? chunk.length : lastNewline(chunk, pending.length) + 1;

                    if (complete > 0) {
                        HandleBuffer aHandle = new HandleBuffer(complete);
                        aHandle.getMbuffer().put(chunk, 0, complete);
                        // execute (not submit) so an unexpected worker failure is reported
                        // by the thread's uncaught-exception handler instead of being lost.
                        pool.execute(aHandle);
                    }

                    pending = new byte[chunk.length - complete];
                    System.arraycopy(chunk, complete, pending, 0, pending.length);
                }
                foffset += mapLength;
            }
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + file);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed while reading " + file);
            e.printStackTrace();
        } finally {
            if (fis != null) {
                try {
                    fis.close(); // also closes the channel obtained from it
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        pool.shutdown();
        try {
            // Blocks exactly until the last worker ends; a sleep-poll would add up to
            // one poll interval to the measured run time.
            pool.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns the index of the last {@code '\n'} in {@code data} at or after {@code from},
     * or -1 when there is none.
     */
    private static int lastNewline(byte[] data, int from) {
        for (int i = data.length - 1; i >= from; i--) {
            if (data[i] == '\n') {
                return i;
            }
        }
        return -1;
    }

    /**
     * Counts the line {@code data[from, to)} in {@code counts} if it contains the
     * {@code "import "} keyword; the key runs from the keyword to the end of the line.
     */
    private static void countImport(byte[] data, int from, int to, Map<String,Integer> counts) {
        int lastStart = to - IMPORT_KEYWORD.length;
        for (int start = from; start <= lastStart; start++) {
            int matched = 0;
            while (matched < IMPORT_KEYWORD.length && data[start + matched] == IMPORT_KEYWORD[matched]) {
                matched++;
            }
            if (matched == IMPORT_KEYWORD.length) {
                String statement = new String(data, start, to - start).trim();
                Integer seen = counts.get(statement);
                counts.put(statement, seen == null ? 1 : seen.intValue() + 1);
                return; // only the first keyword on a line starts a statement
            }
        }
    }

    /**
     * Worker task: counts the import statements in the bytes that were put into its
     * buffer (everything before the buffer's current position) and merges the counts
     * into the global map.
     */
    class HandleBuffer implements Runnable{

        ByteBuffer mbuffer;

        /** Creates a worker with a buffer of {@link StateCounterNIO#handleSize} bytes. */
        HandleBuffer() {
            this(handleSize);
        }

        /**
         * Creates a worker whose buffer holds {@code capacity} bytes, so that a chunk
         * made of a carried-over partial line plus a fresh slice always fits.
         */
        HandleBuffer(int capacity) {
            mbuffer = ByteBuffer.allocate(capacity);
        }

        /** Returns the buffer the producer fills before the task is executed. */
        public ByteBuffer getMbuffer(){
            return mbuffer;
        }

        @Override
        public void run() {
            Map<String,Integer> counts = new HashMap<String,Integer>();
            byte[] data = mbuffer.array(); // heap buffer from allocate(), so array() is available
            int length = mbuffer.position();
            int lineStart = 0;
            // i == length closes a final line that has no trailing '\n'.
            for (int i = 0; i <= length; i++) {
                if (i == length || data[i] == '\n') {
                    countImport(data, lineStart, i, counts);
                    lineStart = i + 1;
                }
            }
            StateCounterNIO.updateTopList(counts);
        }
    }

    /**
     * Returns up to {@code limit} entries of {@link #result}, highest count first;
     * equal counts are ordered by key so the output is deterministic.
     */
    static List<Map.Entry<String,Integer>> topEntries(int limit) {
        List<Map.Entry<String,Integer>> sorted = new ArrayList<Map.Entry<String,Integer>>(result.entrySet());
        Collections.sort(sorted, new Comparator<Map.Entry<String,Integer>>(){
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        int size = Math.min(Math.max(limit, 0), sorted.size());
        return new ArrayList<Map.Entry<String,Integer>>(sorted.subList(0, size));
    }

    /**
     * Runs the count and prints the most frequent statements followed by the elapsed
     * milliseconds.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     */
    public static void main(String[] args) {
        long startat = System.currentTimeMillis();
        StateCounterNIO aNIO = args.length > 0 ? new StateCounterNIO(args[0]) : new StateCounterNIO();
        aNIO.getTop10();
        for (Map.Entry<String,Integer> aEntry : topEntries(TOP_LIMIT)) {
            System.out.println(aEntry.getKey() + "--"+ aEntry.getValue());
        }
        System.out.println("The cost is " + (System.currentTimeMillis()-startat) );
    }
}

==================================BIO======================================

package com.arvey.files;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.TreeMap;

import java.util.concurrent.ConcurrentHashMap;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Counts how often each {@code import} statement occurs in one very large text file,
 * reading it line by line with a {@link BufferedReader} and counting in a thread pool.
 *
 * <p>The reader thread collects batches of up to {@code LINES_PER_TASK} lines and hands
 * each batch to a worker. A worker applies {@link #pattern} to every line, counts the
 * matches in a private map and merges that map into the shared {@link #result} map.
 * The file is decoded with the platform default charset.
 *
 * <p>An instance is single-use: {@link #getTop10()} shuts the worker pool down.
 */
public class StateCountBIO {

    /** Size of the fixed worker thread pool. */
    public final static int ExecutorsNum = 200;

    /** Maximum number of lines handed to one worker task. */
    private static final int LINES_PER_TASK = 1024;

    /** Number of entries printed by {@link #main(String[])}. */
    private static final int TOP_LIMIT = 100;

    ExecutorService pool = Executors.newFixedThreadPool(ExecutorsNum);

    /** Path of the file to scan; defaults to the author's local test file. */
    String file="/Users/arvey/wwork/Docs.code.clean/docsapp/bigfile.src";

    /** Global statement-to-count map shared by all workers (and all instances). */
    public static ConcurrentHashMap<String,Integer> result = new ConcurrentHashMap<String,Integer>();

    /** Matches a line that starts with {@code import} and contains a {@code ';'}. */
    public static Pattern pattern = Pattern.compile("^(import.+);");

    private int poolcounter = 0;

    /** Creates a counter for the default {@link #file} path. */
    public StateCountBIO() {
    }

    /**
     * Creates a counter for the given file.
     *
     * @param file path of the file to scan
     */
    public StateCountBIO(String file) {
        this.file = file;
    }

    /** Returns the number of worker tasks handed to the pool so far. */
    public int getPoolcounter(){
        return poolcounter;
    }

    /**
     * Adds the counts of a worker's private map to the global {@link #result} map.
     * Synchronized so that the read-add-write on each key is not interleaved between workers.
     *
     * @param partial statement-to-count map produced by one worker
     */
    public static synchronized void updateTopList(Map<String,Integer> partial){
        for (Map.Entry<String,Integer> entry : partial.entrySet()) {
            Integer current = result.get(entry.getKey());
            int increment = entry.getValue().intValue();
            result.put(entry.getKey(), current == null ? increment : current.intValue() + increment);
        }
    }

    /**
     * Reads {@link #file} in batches of lines, hands every non-empty batch to the worker
     * pool and blocks until every worker has merged its counts into {@link #result}.
     *
     * <p>I/O failures are reported on standard error and end the scan early; batches
     * that were already handed over are still counted.
     */
    public void getTop10(){
        FileInputStream fio = null;
        try {
            fio = new FileInputStream(new File(file));
            BufferedReader freader = new BufferedReader(new InputStreamReader(fio));
            boolean moreLines = true;
            while (moreLines) {
                HandleRun anInst = new HandleRun();
                StringBuilder batch = anInst.getBuffer();
                int lines = 0;
                while (lines < LINES_PER_TASK) {
                    String content = freader.readLine();
                    if (content == null) {
                        moreLines = false;
                        break;
                    }
                    batch.append(content).append('\n');
                    lines++;
                }
                // Skip the empty batch that appears when the line count is an exact
                // multiple of the batch size (or the file is empty).
                if (lines > 0) {
                    poolcounter++;
                    // execute (not submit) so an unexpected worker failure is reported
                    // by the thread's uncaught-exception handler instead of being lost.
                    pool.execute(anInst);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + file);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed while reading " + file);
            e.printStackTrace();
        } finally {
            // fio is still null when the file could not be opened.
            if (fio != null) {
                try {
                    fio.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        pool.shutdown();
        try {
            // Blocks exactly until the last worker ends; a sleep-poll would add up to
            // one poll interval to the measured run time.
            pool.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            pool.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Worker task: applies {@link StateCountBIO#pattern} to every line of its batch and
     * merges the per-statement counts into the global map.
     */
    class HandleRun implements Runnable{

        StringBuilder buffer = new StringBuilder();

        /** Returns the builder the reader thread fills with {@code '\n'}-terminated lines. */
        public StringBuilder getBuffer(){
            return buffer;
        }

        @Override
        public void run() {
            Map<String,Integer> res = new HashMap<String,Integer>();
            for (String aLine : buffer.toString().split("\n")) {
                Matcher m = pattern.matcher(aLine);
                if (m.find()) {
                    String key = m.group(0);
                    Integer seen = res.get(key);
                    res.put(key, seen == null ? 1 : seen.intValue() + 1);
                }
            }
            StateCountBIO.updateTopList(res);
        }
    }

    /**
     * Orders keys of {@link StateCountBIO#result} by descending count, breaking ties by
     * key so that distinct keys never compare as equal and a key equals itself.
     */
    static class ValueComparator implements Comparator<String>{

        Map<String,Integer> map = result;

        public ValueComparator(){
        }

        @Override
        public int compare(String o1, String o2) {
            int byCount = map.get(o2).compareTo(map.get(o1));
            return byCount != 0 ? byCount : o1.compareTo(o2);
        }
    }

    /**
     * Returns up to {@code limit} entries of {@link #result}, highest count first;
     * equal counts are ordered by key so the output is deterministic.
     */
    static List<Map.Entry<String,Integer>> topEntries(int limit) {
        List<Map.Entry<String,Integer>> sorted = new ArrayList<Map.Entry<String,Integer>>(result.entrySet());
        Collections.sort(sorted, new Comparator<Map.Entry<String, Integer>>(){
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2){
                int byCount = o2.getValue().compareTo(o1.getValue());
                return byCount != 0 ? byCount : o1.getKey().compareTo(o2.getKey());
            }
        });
        int size = Math.min(Math.max(limit, 0), sorted.size());
        return new ArrayList<Map.Entry<String,Integer>>(sorted.subList(0, size));
    }

    /**
     * Runs the count and prints the most frequent statements, the number of worker
     * tasks and the elapsed milliseconds.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     */
    public static void main(String[] args) {
        long startat = System.currentTimeMillis();
        StateCountBIO aCount = args.length > 0 ? new StateCountBIO(args[0]) : new StateCountBIO();
        aCount.getTop10();
        for (Map.Entry<String,Integer> aEntry : topEntries(TOP_LIMIT)) {
            System.out.println(aEntry.getKey() + "--"+ aEntry.getValue());
        }
        System.out.println("The Thread counter is " + aCount.getPoolcounter());
        System.out.println("The cost is " + (System.currentTimeMillis()-startat) );
    }
}
=====================================================
效率分析
处理大文件:文件 size 达到 8967006720 时,线程池大小分别为 200、100、50 的对比如下(五次执行的平均结果):
                       NIO                BIO
200                139843          67376 
100                136914          66576
50                  140000          67249  
为何NIO的要慢于BIO的呢?
NIO 在处理线程中遍历 buffer,是不是这个原因造成的呢?当增加每次 buffer 处理的容量时,性能提升明显:例如把文件每次 map 的大小和每个线程处理的 buffer 空间都扩大 10 倍时,在 50 个线程下,数据降到 82439;但是对 BIO 调整一次处理的行数,性能变化很小,程序运行时间略有增长(726**)。那么怎样才能获得最好的性能呢?

时间: 2024-10-03 09:10:39

自娱小程序--超大文件topN的相关文章

自娱小程序--九宫格之python版

#/bin/python class JiuGG:    def __init__(self):   self.imap = {}   self.pos5=0   self.pos1=0   self.pos2=0   self.pos3=0   self.pos4=0   self.pos7=0  def getalist(self,curr,summ):   isets=[]   for i in range(summ//2+1):    if( i == curr ):     conti

自娱小程序--九宫格

package com.arvey.eventbank.crash; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class JiuGG { /* * 构建数据结构 * Map<1-5整数(亦做数值使用),两层嵌夽整型SET(两个数值,升序) key + sum(set) = 15 *  */ Map<Integer,Set<S

【小程序】找出文件夹中特定后缀名的文件,并输出到txt

因为这几天在弄Qt,这些高级语言真的没啥,但是环境却是很难配好,要放入所有的Qt库文件,必须找出所有 .lib 后缀的文件,于是...这个程序应运而生. 这个小程序没有文件夹的嵌套,其实归根结底,程序的核心就是一些系统函数的调用和字符串处理. 直接上代码: //只需要输出当前文件夹里面的文件名 //不用管嵌套文件夹 #include "stdio.h" #include "windows.h" #include <string.h> //对不满足要求的文

将 Java 小程序迁移到 Microsoft J# 浏览器控件

程序|控件|浏览器 Visual J# .NET 小组 Microsoft Corporation 摘要:通过 Microsoft J# 浏览器控件,开发人员可以将所编写的在 Java 虚拟机上运行的 Java 小程序迁移到 .NET 框架.本文向开发人员介绍 J# 浏览器控件,并说明将 Java 小程序迁移到 .NET 框架的步骤.本文还讨论了其他一些主题,如 J# 浏览器控件的安全性和调试,以及当前版本中不受支持的功能. 下载 HTML 小程序到对象标记转换器 (95KB) 本文假设开发人员

《微信小程序:开发入门及案例详解》—— 2.4 框架页面文件

2.4 框架页面文件 小程序中一个框架页面包含4个文件,同一框架页面的这4个文件必须具有相同的路径与文件名,进入小程序时或页面跳转时,小程序会根据app.json配置的路径找到对应的资源进行渲染. ◇ .js文件:页面逻辑文件,必要项. ◇ .wxml文件:页面结构文件,必要项. ◇ .wxss文件:页面样式文件. ◇ .json文件:页面配置文件. 与框架主体文件相比框架页面文件多了一种页面结构文件,其余3个文件和框架主体文件的功能类同,下面我们一一讲解每个文件作用. 2.4.1 页面配置文件

用到xml文件-C#初学者学到XML文件部分写了个小程序

问题描述 C#初学者学到XML文件部分写了个小程序 错误 1 "CSharp.Study.XML.XmlOperation"不实现接口成员"CSharp.Study.XML.IFileOperation.Save(string, string)"."CSharp.Study.XML.XmlOperation.Save(string, string)"无法实现"CSharp.Study.XML.IFileOperation.Save(st

刚学的java,写了个压缩文件的小程序,一直没有成功

问题描述 刚学的java,写了个压缩文件的小程序,一直没有成功 public class f2Test { public static void main(String[] args){ frame02 f2=new frame02(); } } import java.awt.BorderLayout; import java.awt.Color; import java.awt.Image; import java.awt.Menu; import java.awt.MenuBar; imp

微信小程序-图片、录音、音频播放、音乐播放、视频、文件代码实例_javascript技巧

本文介绍了微信小程序的开发,主要包括图片.录音.音频播放.音乐播放.视频.文件,具体如下: 图片: wx.chooseImage(OBJECT) 从本地相册选择图片或使用相机拍照. OBJECT参数说明: 注:文件的临时路径,在小程序本次启动期间可以正常使用,如需持久保存,需在主动调用 wx.saveFile,在小程序下次启动时才能访问得到. 示例代码: wx.chooseImage({ count: 1, // 默认9 sizeType: ['original', 'compressed'],

hta文件介绍(编写小程序的好东东)_hta

总是想用script语言编写一些小程序,以前总是写成html格式的,可每次打开都会有安全提示,今天突然发现了如果保存成hta格式的文件,效果大不一样. 嘿嘿.. HTA是HTML Application的缩写(HTML应用程序),是软件开发的新概念,直接将HTML保存成HTA的格式,就是一个独立的应用软件,与VB.C++等程序语言所设计的软件没什么差别. 下面是一个HTA的例子: 复制代码 代码如下: <!-example1.hta--><html><head><